2020import hashlib
2121import inspect
2222import logging
23+ import sys
2324import time
2425import traceback
2526import uuid
@@ -290,6 +291,7 @@ def __init__(
290291 max_concurrent_workflow_tasks : int = 10 ,
291292 max_concurrent_activity_tasks : int = 10 ,
292293 shutdown_timeout : float = 30.0 ,
294+ heartbeat_interval : float = 60.0 ,
293295 metrics : MetricsRecorder | None = None ,
294296 interceptors : Iterable [WorkerInterceptor ] = (),
295297 ) -> None :
@@ -313,6 +315,8 @@ def __init__(
313315 raise ValueError ("max_concurrent_workflow_tasks must be at least 1" )
314316 if max_concurrent_activity_tasks < 1 :
315317 raise ValueError ("max_concurrent_activity_tasks must be at least 1" )
318+ if heartbeat_interval <= 0 :
319+ raise ValueError ("heartbeat_interval must be positive" )
316320
317321 self ._poll_timeout = poll_timeout
318322 self .max_concurrent_workflow_tasks = max_concurrent_workflow_tasks
@@ -323,6 +327,13 @@ def __init__(
323327 self ._shutdown_timeout = shutdown_timeout
324328 self ._in_flight : set [asyncio .Task [Any ]] = set ()
325329 self ._query_tasks_supported = False
330+ # In-flight slot accounting feeds the periodic heartbeat so operators
331+ # see free-slot counts without the worker having to re-derive them at
332+ # shutdown. Counters are bumped/decremented around dispatch.
333+ self ._workflow_inflight = 0
334+ self ._activity_inflight = 0
335+ self ._heartbeat_interval = float (heartbeat_interval )
336+ self ._process_started_at = time .time ()
326337 configured_metrics = metrics if metrics is not None else getattr (client , "metrics" , NOOP_METRICS )
327338 self .metrics : MetricsRecorder = configured_metrics or NOOP_METRICS
328339 self .interceptors = tuple (interceptors )
@@ -388,7 +399,7 @@ async def _register(self) -> None:
388399 _manifest_version (info .get ("worker_protocol" )),
389400 )
390401
391- await self .client .register_worker (
402+ ack = await self .client .register_worker (
392403 worker_id = self .worker_id ,
393404 task_queue = self .task_queue ,
394405 supported_workflow_types = list (self .workflows ),
@@ -398,6 +409,14 @@ async def _register(self) -> None:
398409 max_concurrent_activity_tasks = self .max_concurrent_activity_tasks ,
399410 build_id = self .build_id ,
400411 )
412+ # Adapt to the server-advertised cadence when present so a cluster
413+ # can pin the worker fleet's heartbeat beat without each worker
414+ # passing the cadence explicitly. Falls back to the constructor
415+ # value when the server has not advertised a cadence.
416+ if isinstance (ack , dict ):
417+ advertised = ack .get ("heartbeat_interval_seconds" )
418+ if isinstance (advertised , int ) and advertised > 0 :
419+ self ._heartbeat_interval = float (advertised )
401420 log .info ("worker %s registered on %s" , self .worker_id , self .task_queue )
402421
403422 async def _run_workflow_task (self , task : dict [str , Any ]) -> list [dict [str , Any ]] | None :
@@ -993,12 +1012,14 @@ async def _poll_workflow_tasks(self) -> None:
async def _dispatch_workflow_task(self, task: dict[str, Any]) -> None:
    """Run one workflow task and always hand back its concurrency slot.

    Bumps the workflow in-flight counter for the duration of the task,
    delegates execution to ``_run_workflow_task``, records a task metric
    labelled with the outcome, and finally releases ``_wf_semaphore``.

    Outcome labels:
        * ``"completed"`` - ``_run_workflow_task`` returned a non-``None`` value.
        * ``"failed"``    - ``_run_workflow_task`` returned ``None``.
        * ``"error"``     - it raised (the exception is logged, not re-raised).

    Args:
        task: The workflow task payload, passed through unchanged to
            ``_run_workflow_task``.
    """
    task_start = time.perf_counter()
    outcome = "error"
    self._workflow_inflight += 1
    try:
        commands = await self._run_workflow_task(task)
        outcome = "completed" if commands is not None else "failed"
    except Exception:
        log.exception("unhandled error in workflow task execution")
    finally:
        # Clamp at zero so a stray double-decrement can never make the
        # counter negative.
        self._workflow_inflight = max(0, self._workflow_inflight - 1)
        try:
            self._record_task_metrics("workflow", outcome, time.perf_counter() - task_start)
        finally:
            # Release even when the metrics recorder raises; otherwise the
            # slot would be lost for the lifetime of the worker. The
            # recorder's exception still propagates to the caller.
            self._wf_semaphore.release()
10041025
@@ -1032,11 +1053,13 @@ async def _poll_activity_tasks(self) -> None:
async def _dispatch_activity_task(self, task: dict[str, Any]) -> None:
    """Run one activity task and always hand back its concurrency slot.

    Bumps the activity in-flight counter for the duration of the task,
    delegates execution to ``_run_activity_task`` (whose return value is
    used verbatim as the metric outcome label), records a task metric,
    and finally releases ``_act_semaphore``. If ``_run_activity_task``
    raises, the exception is logged and the outcome stays ``"error"``.

    Args:
        task: The activity task payload, passed through unchanged to
            ``_run_activity_task``.
    """
    task_start = time.perf_counter()
    outcome = "error"
    self._activity_inflight += 1
    try:
        outcome = await self._run_activity_task(task)
    except Exception:
        log.exception("unhandled error in activity task execution")
    finally:
        # Clamp at zero so a stray double-decrement can never make the
        # counter negative.
        self._activity_inflight = max(0, self._activity_inflight - 1)
        try:
            self._record_task_metrics("activity", outcome, time.perf_counter() - task_start)
        finally:
            # Release even when the metrics recorder raises; otherwise the
            # slot would be lost for the lifetime of the worker. The
            # recorder's exception still propagates to the caller.
            self._act_semaphore.release()
10421065
@@ -1076,12 +1099,96 @@ async def run(self) -> None:
10761099 await self ._register ()
10771100 wf_loop = asyncio .create_task (self ._poll_workflow_tasks ())
10781101 act_loop = asyncio .create_task (self ._poll_activity_tasks ())
1079- loops = [wf_loop , act_loop ]
1102+ hb_loop = asyncio .create_task (self ._heartbeat_loop ())
1103+ loops = [wf_loop , act_loop , hb_loop ]
10801104 if self ._query_tasks_supported :
10811105 loops .append (asyncio .create_task (self ._poll_query_tasks ()))
10821106 with contextlib .suppress (asyncio .CancelledError ):
10831107 await asyncio .gather (* loops )
10841108
async def _heartbeat_loop(self) -> None:
    """Periodically refresh the server-side worker registration.

    Each beat sends the worker id, the current task-slot availability
    (``_current_task_slots``) and basic process metrics
    (``_current_process_metrics``) through ``client.heartbeat_worker``.

    The loop sleeps for ``self._heartbeat_interval`` seconds between
    beats and wakes early when ``self._stop`` is set, in which case it
    exits without sending a final heartbeat. A failed heartbeat is logged
    at warning level and retried on the next tick.

    When the acknowledgement is a dict carrying a positive integer
    ``heartbeat_interval_seconds``, that value replaces the current
    cadence. Booleans are rejected explicitly: ``bool`` is a subclass of
    ``int``, so a stray ``True`` would otherwise pin the worker to a
    1-second heartbeat.

    NOTE(review): the original docstring states the server bounds the
    advertised cadence to [1s, 1h]; this loop does not enforce that range
    itself - confirm against the server contract.
    """
    while not self._stop.is_set():
        try:
            await asyncio.wait_for(self._stop.wait(), timeout=self._heartbeat_interval)
        except asyncio.TimeoutError:
            # Interval elapsed without a stop request: time to beat.
            pass
        if self._stop.is_set():
            return
        try:
            ack = await self.client.heartbeat_worker(
                worker_id=self.worker_id,
                task_slots=self._current_task_slots(),
                process_metrics=self._current_process_metrics(),
            )
        except Exception as e:
            # Best effort: one missed beat must not take the worker down.
            log.warning("worker heartbeat failed: %s", e)
            continue
        if not isinstance(ack, dict):
            continue
        advertised = ack.get("heartbeat_interval_seconds")
        if isinstance(advertised, bool) or not isinstance(advertised, int):
            continue
        if advertised > 0:
            self._heartbeat_interval = float(advertised)
def _current_task_slots(self) -> dict[str, int]:
    """Return the number of free workflow and activity slots.

    Each value is the configured concurrency limit minus the matching
    in-flight counter, floored at zero so the report never goes negative.
    """
    capacity = (
        ("workflow_available", self.max_concurrent_workflow_tasks, self._workflow_inflight),
        ("activity_available", self.max_concurrent_activity_tasks, self._activity_inflight),
    )
    return {label: max(0, limit - busy) for label, limit, busy in capacity}
1150+
def _current_process_metrics(self) -> dict[str, Any]:
    """Collect best-effort process health figures for the heartbeat.

    Returns:
        A dict that always contains ``process_uptime_seconds`` (whole
        seconds since ``self._process_started_at``, never negative) and
        ``process_id``. Where ``resource.getrusage`` is available it also
        contains ``memory_bytes`` (``ru_maxrss`` normalised to bytes) and
        ``cpu_percent`` (user + system CPU seconds over elapsed wall time,
        clamped to [0, 100]). ``host`` is added when a non-empty hostname
        can be resolved, truncated to 255 characters.
    """
    import os
    import socket

    # Sample the clock once so uptime and CPU share the same denominator.
    # time.time() is a wall clock and may step backwards; clamp at zero so
    # a clock adjustment never produces a negative uptime.
    elapsed = max(0.0, time.time() - self._process_started_at)

    metrics: dict[str, Any] = {
        "process_uptime_seconds": int(elapsed),
        "process_id": os.getpid(),
    }

    try:
        import resource

        usage = resource.getrusage(resource.RUSAGE_SELF)
    except (ImportError, OSError):
        # `resource` is POSIX-only - Windows skips getrusage but still
        # reports pid + uptime + host so the operator surface remains
        # populated.
        pass
    else:
        # ru_maxrss is kilobytes on Linux and bytes on macOS - normalize
        # to bytes so the units stay consistent across platforms.
        max_rss = int(usage.ru_maxrss)
        metrics["memory_bytes"] = max_rss if sys.platform == "darwin" else max_rss * 1024

        cpu_seconds = float(usage.ru_utime) + float(usage.ru_stime)
        # Floor the denominator so a just-started worker cannot divide by
        # (nearly) zero.
        wall_seconds = max(0.001, elapsed)
        metrics["cpu_percent"] = max(
            0.0, min(100.0, round((cpu_seconds / wall_seconds) * 100.0, 2))
        )

    try:
        host = socket.gethostname()
    except Exception:
        # Hostname is purely informational; never fail a heartbeat over it.
        host = ""
    if isinstance(host, str) and host != "":
        metrics["host"] = host[:255]

    return metrics
1191+
10851192 async def run_until (
10861193 self ,
10871194 * ,
0 commit comments