Skip to content

Commit 2c05d11

Browse files
committed
optimize parallel performance with zmq
1 parent 1da86e2 commit 2c05d11

12 files changed

Lines changed: 190 additions & 250 deletions

File tree

ajet/backbone/trainer_verl.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -834,10 +834,10 @@ def fit(self): # noqa: C901
834834
progress_bar.update(1)
835835
self.global_steps += 1
836836

837-
# when enabled oai request interchange, we need to clear the cache from time to time
838-
if self.config.ajet.enable_experimental_reverse_proxy:
839-
from ajet.tuner_lib.weight_tuner.experimental.as_oai_model_server import ensure_dat_interchange_server_cache_clear
840-
ensure_dat_interchange_server_cache_clear()
837+
# # when enabled oai request interchange, we need to clear the cache from time to time
838+
# if self.config.ajet.enable_experimental_reverse_proxy:
839+
# from ajet.tuner_lib.weight_tuner.experimental.as_oai_model_server import ensure_dat_interchange_server_cache_clear
840+
# ensure_dat_interchange_server_cache_clear()
841841

842842
if is_last_step:
843843
pprint(f"Final validation metrics: {last_val_metrics}")

ajet/default_config/ajet_default.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ ajet:
99
# the experimental reverse proxy feature that allows `tuner.as_oai_baseurl_apikey` feature
1010
enable_experimental_reverse_proxy: False
1111

12+
# method used to submit llm inference requests
13+
llm_infer_submit_method: "async" # options: "sync", "async"
14+
1215
task_runner:
1316
wrapper_type: "asyncio-with-gc"
1417
wrapper_multiprocessing_timeout: 3600 # in seconds

ajet/task_rollout/async_llm_bridge.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def __init__(
6262
self.tokenizer = tokenizer
6363
self.llm_mode = llm_mode
6464
self.max_llm_retries = max_llm_retries
65-
65+
self.tool_parser = Hermes2ProToolParser(self.tokenizer)
6666

6767
def get_llm_inference_fn_sync(self, sampling_params: dict = {}) -> Callable: # noqa: C901
6868

@@ -123,8 +123,7 @@ def llm_chat_verl(
123123
and ("</tool_call>" in decoded_text)
124124
and (not self.config.ajet.rollout.force_disable_toolcalls)
125125
):
126-
tool_parser = Hermes2ProToolParser(self.tokenizer)
127-
parsed_tool_calls = tool_parser.extract_tool_calls(decoded_text, None) # type: ignore
126+
parsed_tool_calls = self.tool_parser.extract_tool_calls(decoded_text, None) # type: ignore
128127
parsed_tool_calls = parsed_tool_calls.model_dump()
129128
if self.config.ajet.execute_test:
130129
_test_if_test_mode(
@@ -323,8 +322,8 @@ async def llm_chat_verl(
323322
and ("</tool_call>" in decoded_text)
324323
and (not self.config.ajet.rollout.force_disable_toolcalls)
325324
):
326-
tool_parser = Hermes2ProToolParser(self.tokenizer)
327-
parsed_tool_calls = tool_parser.extract_tool_calls(decoded_text, None) # type: ignore
325+
326+
parsed_tool_calls = self.tool_parser.extract_tool_calls(decoded_text, None) # type: ignore
328327
parsed_tool_calls = parsed_tool_calls.model_dump()
329328
if self.config.ajet.execute_test:
330329
_test_if_test_mode(
@@ -535,14 +534,15 @@ async def run_infer(
535534
# otherwise, for abnormal output, can still proceed, but we do not track output anymore
536535

537536
# run llm inference ✨
538-
# if sync:
539-
# llm_output = await asyncio.wait_for(
540-
# asyncio.to_thread(
541-
# self.llm_inference_fn, converted_message, custom_sampling_params, tools
542-
# ),
543-
# timeout=1800,
544-
# )
545-
llm_output = await asyncio.wait_for(self.llm_inference_fn(converted_message, custom_sampling_params, tools), timeout=1800)
537+
if self.config.ajet.llm_infer_submit_method == "sync":
538+
llm_output = await asyncio.wait_for(
539+
asyncio.to_thread(
540+
self.llm_inference_fn, converted_message, custom_sampling_params, tools
541+
),
542+
timeout=1800,
543+
)
544+
else:
545+
llm_output = await asyncio.wait_for(self.llm_inference_fn(converted_message, custom_sampling_params, tools), timeout=1800)
546546

547547

548548
# begin context tracking

ajet/task_rollout/single_worker.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,15 @@ def rollout_env_worker(
8484
(with validation overrides), and robust retry on transient failures.
8585
"""
8686
sampling_params = get_sample_params(mode, self.config)
87-
llm_inference_fn = self.async_llm_bridge.get_llm_inference_fn_async(
88-
sampling_params=sampling_params
89-
)
87+
88+
if self.config.ajet.llm_infer_submit_method == "sync":
89+
llm_inference_fn = self.async_llm_bridge.get_llm_inference_fn_sync(
90+
sampling_params=sampling_params
91+
)
92+
else:
93+
llm_inference_fn = self.async_llm_bridge.get_llm_inference_fn_async(
94+
sampling_params=sampling_params
95+
)
9096

9197
workflow_task = WorkflowTask(
9298
env_type=task.env_type,

ajet/tuner.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def as_oai_baseurl_apikey(
114114
agent_name=agent_name,
115115
target_tag=target_tag,
116116
episode_uuid=self.context_tracker.episode_uuid,
117+
episode_contect_address=self.interchange_client.episode_contect_address,
117118
)
118119
return baseurl_apikey_model
119120

@@ -178,6 +179,7 @@ def _enable_experimental_interchange_server(self, llm_inference_fn):
178179
config=self.config,
179180
llm_inference_fn=llm_inference_fn,
180181
)
182+
return self.interchange_client.begin_service()
181183

182184

183185
def terminate_episode(self):

ajet/tuner_lib/weight_tuner/as_oai_baseurl_apikey.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from openai.resources.chat.chat import Chat, AsyncChat
1313
from openai.resources.completions import AsyncCompletions
1414
from openai import OpenAI, AsyncOpenAI
15+
from ajet.utils.free_port import find_free_port
1516
from .experimental.as_oai_model_client import generate_auth_token
1617

1718
if TYPE_CHECKING:
@@ -43,6 +44,7 @@ def __init__(
4344
target_tag: str,
4445
agent_name: str,
4546
episode_uuid: str,
47+
episode_contect_address: str,
4648
**kwargs,
4749
):
4850
port = os.getenv("AJET_DAT_INTERCHANGE_PORT")
@@ -52,6 +54,7 @@ def __init__(
5254
agent_name=agent_name,
5355
target_tag=target_tag,
5456
episode_uuid=episode_uuid,
57+
episode_address=episode_contect_address,
5558
)
5659
model = "reserved_field"
5760

Lines changed: 76 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11

22
import asyncio
3+
import atexit
34
import json
45
import threading
56
import os
@@ -9,14 +10,17 @@
910
from typing import Optional, List, Dict, Any, Union, TYPE_CHECKING
1011
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse
1112
from openai.types.chat.chat_completion import ChatCompletion
13+
from ajet.tuner_lib.weight_tuner.experimental.as_oai_model_server import InterchangeCompletionRequest
1214
from redis.exceptions import TimeoutError
13-
15+
from ajet.utils.free_port import find_free_port
16+
from ajet.utils.sington import ThreadExecutorLlmInferSingleton, ThreadExecutorSingleton
1417
from functools import cache
1518

1619
import pickle
1720
import httpx
1821
import zmq
1922
import logging
23+
2024
logging.getLogger("httpx").setLevel(logging.WARNING)
2125

2226
import base64
@@ -25,7 +29,10 @@
2529
if TYPE_CHECKING:
2630
from ajet.context_tracker.multiagent_tracking import MultiAgentContextTracker
2731

28-
def generate_auth_token(agent_name, target_tag, episode_uuid):
32+
DEBUG = False
33+
# DEBUG = True
34+
35+
def generate_auth_token(agent_name, target_tag, episode_uuid, episode_address):
2936
"""
3037
Generate a Base64-encoded auth_token from the given agent_name, target_tag, episode_uuid, and episode_address.
3138
@@ -41,7 +48,8 @@ def generate_auth_token(agent_name, target_tag, episode_uuid):
4148
auth_data = {
4249
"agent_name": agent_name,
4350
"target_tag": target_tag,
44-
"episode_uuid": episode_uuid
51+
"episode_uuid": episode_uuid,
52+
"episode_address": episode_address,
4553
}
4654

4755
# Step 2: Convert the dictionary to a JSON string
@@ -68,12 +76,15 @@ def get_redis_connection_pool():
6876
)
6977
return pool
7078

71-
79+
@cache
7280
def get_redis_client():
7381
pool = get_redis_connection_pool()
7482
return redis.Redis(connection_pool=pool, decode_responses=False, encoding='utf-8')
7583

7684

85+
context = zmq.Context()
86+
atexit.register(context.term)
87+
7788
class InterchangeClient:
7889

7990
def __init__(self, episode_uuid: str, context_tracker: "MultiAgentContextTracker", llm_inference_fn, config):
@@ -82,7 +93,10 @@ def __init__(self, episode_uuid: str, context_tracker: "MultiAgentContextTracker
8293
self.llm_inference_fn = llm_inference_fn
8394
self.config = config
8495
self._should_terminate = False
85-
self.begin_service()
96+
97+
# self.episode_contect_address = f"tcp://localhost:{find_free_port()}"
98+
self.ipc_path = f"/tmp/ajet/{self.episode_uuid}.sock"
99+
self.episode_contect_address = f"ipc://{self.ipc_path}"
86100

87101

88102
async def llm_infer(
@@ -124,127 +138,78 @@ def begin_service(self):
124138
"""
125139
Starts the SSE service loop.
126140
"""
127-
t = threading.Thread(target=self._begin_service_threading, daemon=True)
128-
t.start()
129-
130-
131-
def _handle_service_request(self, msg: bytes, sem: threading.Semaphore):
132-
"""handle a single service request in its own thread
133-
"""
134-
from ajet.tuner_lib.weight_tuner.experimental.as_oai_model_server import InterchangeCompletionRequest
135-
logger.info(f"[client] {self.episode_uuid} | inside _handle_service_request")
136-
redis_client = get_redis_client()
137-
logger.info(f"[client] {self.episode_uuid} | get_redis_client")
138-
data_as_json = ""
139-
topic = ""
140-
try:
141-
data_as_json = json.loads(pickle.loads(msg))
142-
timeline_uuid = data_as_json["timeline_uuid"]
143-
topic = f"stream:timeline:{timeline_uuid}"
144-
logger.info(f"[client] {self.episode_uuid} | json.loads(pickle.loads(msg))")
145-
146-
147-
if "health_check" in data_as_json and data_as_json["health_check"]:
148-
# logger.info(f"Received health check for timeline_uuid: {timeline_uuid}")
149-
result = '{"health_check_ok": "True"}'
150-
# logger.success(f"Health check OK for timeline_uuid: {timeline_uuid}")
151-
else:
152-
parsed_msg = InterchangeCompletionRequest(**data_as_json)
153-
# start llm request
154-
result = asyncio.run(self.llm_infer(
155-
req=parsed_msg.completion_request,
156-
timeline_uuid=parsed_msg.timeline_uuid,
157-
agent_name=parsed_msg.agent_name,
158-
target_tag=parsed_msg.target_tag,
159-
episode_uuid=parsed_msg.episode_uuid,
160-
)).model_dump_json()
161-
# logger.success(f"LLM inference completed for timeline_uuid: {timeline_uuid}")
162-
logger.info(f"[client] {self.episode_uuid} | result = asyncio.run(self.llm_infer")
163-
# send result back
164-
bytes_arr = pickle.dumps(result)
165-
logger.info(f"[client] {self.episode_uuid} | bytes_arr = pickle.dumps(result)")
166-
redis_client.xadd(topic, {'data': bytes_arr})
167-
redis_client.expire(topic, 600) # expire after 10 mins
168-
logger.info(f"[client] {self.episode_uuid} | redis_client.xadd(topic, ...)")
169-
170-
except Exception as e:
171-
err = f"[ERR]: Error when processing data: {data_as_json} Error: {e}"
172-
result = err
173-
logger.error(err)
174-
if topic:
175-
redis_client.xadd(topic, {'data': pickle.dumps(result)})
176-
redis_client.expire(topic, 600)
177-
178-
finally:
179-
# release semaphore when done
180-
sem.release()
181-
redis_client.close()
141+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | Starting InterchangeClient service loop...")
142+
self.socket = context.socket(zmq.REP)
143+
self.socket.bind(f"{self.episode_contect_address}")
144+
self.socket.setsockopt(zmq.RCVTIMEO, 2*1000)  # 2-second receive timeout
182145

146+
self.executor = ThreadExecutorSingleton().get_executor()
147+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | Submitting _begin_service_threading to executor...")
148+
future = self.executor.submit(self._begin_service_threading)
149+
time.sleep(1)
150+
while future._state == 'PENDING':
151+
time.sleep(1)
152+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | Future ready...")
183153

154+
# t = threading.Thread(target=self._begin_service_threading, daemon=True)
155+
# t.start()
156+
return self.episode_contect_address
184157

185158

186159
def _begin_service_threading(self):
187160
"""begin listening for service requests in a threading model
188161
"""
189-
# logger.success(f"InterchangeClient starting for episode_uuid:{self.episode_uuid}")
190-
# debug_logs = []
191-
begin_time = time.time()
192-
logger.info(f"[client] {self.episode_uuid} | Starting InterchangeClient service loop...")
193-
redis_client = get_redis_client()
194-
episode_stream = f"stream:episode:{self.episode_uuid}"
195-
196-
sem = threading.Semaphore(8) # 4 concurrent requests max
197-
logger.info(f"[client] {self.episode_uuid} | Listening to stream {episode_stream}, waiting for messages...")
198162

199-
last_id = '0-0'
200-
is_init = True
163+
begin_time = time.time()
164+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | Starting ZMQ socket bind complete")
201165

202166
try:
203167
while not self.should_terminate:
204-
# wait for a new message
205-
logger.info(f"[client] {self.episode_uuid} | Waiting for new message on stream {episode_stream}...")
206168

207-
# Check messages
208169
try:
209-
response = redis_client.xread({episode_stream: last_id}, count=1, block=30*1000) # block for 30 seconds (30000 ms)
210-
except TimeoutError:
211-
time.sleep(5)
212-
continue
213-
214-
timepassed = time.time() - begin_time
215-
216-
if not response:
217-
if is_init and timepassed > 30:
218-
logger.warning(f"[client] Still waiting for first message... (time passed {timepassed}) for episode_uuid:{self.episode_uuid}...")
170+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | socket.recv_string() has begun")
171+
message = self.socket.recv_string()
172+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | socket.recv_string() is done")
173+
except zmq.Again as e:
174+
if self.should_terminate:
175+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | episode over")
176+
break
177+
timepassed = time.time() - begin_time
178+
if timepassed > 60:
179+
logger.warning(f"[client] {self.episode_uuid} | Still waiting for first message... (time passed {timepassed}) for episode_uuid:{self.episode_uuid}...")
219180
continue
220181

221-
# Got message
222-
is_init = False
223-
logger.info(f"[client] {self.episode_uuid} | get message...")
224-
225-
stream_result = response[0]
226-
messages = stream_result[1]
227-
msg_id, data_dict = messages[0]
228-
229-
last_id = msg_id
230-
231-
if b'data' in data_dict:
232-
msg: bytes = data_dict[b'data']
233-
else:
234-
logger.error(f"Missing 'data' in stream message {msg_id}")
235-
continue
236-
237-
# are we free to spawn a new thread?
238-
sem.acquire()
239-
logger.info(f"[client] {self.episode_uuid} | sem acquire...")
240-
# begin a new thread to handle this request
241-
threading.Thread(target=self._handle_service_request, args=(msg, sem), daemon=True).start()
242-
182+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | before json.loads(message)")
183+
data_as_json = json.loads(message)
184+
parsed_msg = InterchangeCompletionRequest(**data_as_json)
243185

244-
except KeyboardInterrupt:
245-
return
186+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | before asyncio run self.llm_infer")
246187

188+
try:
189+
loop = asyncio.get_running_loop()
190+
except:
191+
loop = asyncio.new_event_loop()
192+
executor = ThreadExecutorLlmInferSingleton().get_executor()
193+
future = loop.run_in_executor(
194+
executor, # executor
195+
asyncio.run,
196+
self.llm_infer(
197+
req=parsed_msg.completion_request,
198+
timeline_uuid=parsed_msg.timeline_uuid,
199+
agent_name=parsed_msg.agent_name,
200+
target_tag=parsed_msg.target_tag,
201+
episode_uuid=parsed_msg.episode_uuid,
202+
)
203+
)
204+
result = loop.run_until_complete(future).model_dump_json() # type: ignore
205+
206+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | before send_string")
207+
self.socket.send_string(result)
208+
except:
209+
logger.exception(f"[client] {self.episode_uuid} | Exception occurred in service loop.")
247210
finally:
248-
redis_client.delete(episode_stream)
249-
redis_client.close()
250-
211+
self.socket.close()
212+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | ZMQ socket closed, service loop terminated.")
213+
if os.path.exists(self.ipc_path):
214+
os.remove(self.ipc_path)
215+
if DEBUG: logger.info(f"[client] {self.episode_uuid} | IPC socket file {self.ipc_path} removed.")

0 commit comments

Comments
 (0)