Skip to content

Commit a0a020e

Browse files
committed
add llm_call_count to EpisodeStatus and update related logic in swarm server and monitoring
1 parent ac6ab47 commit a0a020e

6 files changed

Lines changed: 16 additions & 7 deletions

File tree

ajet/tuner_lib/experimental/as_oai_model_server.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,10 +169,11 @@ async def chat_completions(request: Request, authorization: str = Header(None)):
169169
if ep_key(episode_uuid) not in shared_mem_dict:
170170
raise HTTPException(status_code=404, detail=f"Episode {episode_uuid} not found.")
171171

172-
# update activate timestamp
172+
# update activate timestamp and increment llm call counter
173173
with shared_mem_dict_lock:
174174
es:EpisodeStatus = shared_mem_dict[ep_key(episode_uuid)]
175175
es.latest_activity_timestamp = time.time()
176+
es.llm_call_count += 1
176177
shared_mem_dict[ep_key(episode_uuid)] = es
177178

178179
# Add to received queue

ajet/tuner_lib/experimental/as_swarm_server.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ async def _revert_episode_to_unclaimed(episode_uuid: str, shared_mem_dict, share
132132
es.episode_status = "registered"
133133
es.client_uuid = ""
134134
es.latest_activity_timestamp = time.time()
135+
es.llm_call_count = 0
135136
es.discard_episode_timeout = -1
136137
with shared_mem_dict_lock:
137138
shared_mem_dict[ep_key(episode_uuid)] = es
@@ -461,6 +462,7 @@ async def register_episode(req: RegisterEpisodeRequest):
461462
discard_episode_timeout=-1,
462463
)
463464
es.latest_activity_timestamp = time.time()
465+
es.llm_call_count = 0
464466

465467
with shared_mem_dict_lock:
466468
shared_mem_dict[ep_key(episode_uuid)] = es
@@ -519,6 +521,7 @@ async def claim_episode(req: ClaimEpisodeRequest):
519521
es.episode_type = req.episode_type
520522
es.client_uuid = req.client_uuid
521523
es.latest_activity_timestamp = time.time()
524+
es.llm_call_count = 0
522525
es.discard_episode_timeout = req.discard_episode_timeout
523526

524527
shared_mem_dict[ep_key(episode_uuid)] = es
@@ -716,6 +719,7 @@ async def get_current_batch_rollout_pool_information():
716719
"episode_status": es.episode_status,
717720
"time_since_last_activity": f"{time_since_last_activity:.1f}s",
718721
"discard_episode_timeout": f"{es.discard_episode_timeout:.1f}s",
722+
"llm_call_count": str(es.llm_call_count),
719723
}
720724
pool_info.running_episode_details = running_episode_details if running_episode_details else None
721725

ajet/tuner_lib/experimental/interchange_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ class EpisodeStatus(BaseModel):
6262
zmq_listen_result_addr: str = ""
6363
latest_activity_timestamp: float = time.time()
6464
discard_episode_timeout: float
65+
llm_call_count: int = 0
6566
debug_log: List[str] = []
6667

6768
class EpisodeBufferResponse(BaseModel):

ajet/tuner_lib/experimental/swarm_overwatch_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class CurrentBatchRolloutPoolInformation(BaseModel):
1212
completed_non_dummy_task_target: int = 0
1313
task_expected_num_repeat: int = 0
1414
completed_tasks_details: Dict[str, List[str]] = {} # task_id -> list of episode_uuids
15-
running_episode_details: Dict[str, Dict[str, str]] | None = None # episode_uuid -> { "episode_status": ..., "time_since_last_activity": ..., "discard_episode_timeout": ...}
15+
running_episode_details: Dict[str, Dict[str, str]] | None = None # episode_uuid -> { "episode_status": ..., "time_since_last_activity": ..., "discard_episode_timeout": ..., "llm_call_count": ...}
1616
engine_status: str | None = None
1717
global_step: int | None = None
1818
booting_start_time: float | None = None # timestamp when ENGINE.BOOTING started

ajet/utils/swarm_overwatch.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -244,11 +244,12 @@ def create_running_episodes_table(
244244
)
245245

246246
table.add_column("Episode UUID", style="cyan", no_wrap=True, width=20, overflow="ellipsis")
247-
table.add_column("Status", style="green", width=15)
248-
table.add_column("Last Req / Patience", style="yellow", width=30)
247+
table.add_column("Status", style="green")
248+
table.add_column("LLM Calls", style="magenta", justify="right")
249+
table.add_column("Last Req / Patience", style="yellow")
249250

250251
if not info.running_episode_details:
251-
table.add_row("[dim]No running episodes[/dim]", "", "")
252+
table.add_row("[dim]No running episodes[/dim]", "", "", "")
252253
return table
253254

254255
# Sort by time since last activity (descending)
@@ -261,15 +262,17 @@ def create_running_episodes_table(
261262
for episode_uuid, details in sorted_episodes[:30]:
262263
last_req = details["time_since_last_activity"]
263264
patience = details.get("discard_episode_timeout", "N/A")
265+
llm_calls = details.get("llm_call_count", "0")
264266
table.add_row(
265267
episode_uuid[:40] if len(episode_uuid) > 40 else episode_uuid,
266268
details["episode_status"],
269+
llm_calls,
267270
f"{last_req} / {patience}",
268271
)
269272

270273
if len(sorted_episodes) > 30:
271274
table.add_row(
272-
f"[dim]... and {len(sorted_episodes) - 30} more episodes[/dim]", "", ""
275+
f"[dim]... and {len(sorted_episodes) - 30} more episodes[/dim]", "", "", ""
273276
)
274277

275278
return table

tutorial/example_academic_trans/trans_roll.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
LOCAL_NUM_EPOCH = 1
1414
LOCAL_MAX_PARALLEL = 32
1515
LOCAL_DATASET_PATH = "/mnt/data_cpfs/qingxu.fu/agentjet/agentjet/tmp/arxiv_papers/train.parquet"
16-
REMOTE_SWARM_URL = "http://localhost:10099" # Change to your swarm remote url
16+
REMOTE_SWARM_URL = "http://localhost:10086" # Change to your swarm remote url
1717

1818
# --------- configurations that take effect remotely -------------
1919
REMOTE_BATCH_SIZE = 8

0 commit comments

Comments
 (0)