1- """Shared lease extension (heartbeat) tracking for TaskRunner and AsyncTaskRunner."""
1+ """Centralized lease extension (heartbeat) management for Conductor task runners.
22
3+ Architecture:
4+ LeaseManager runs a single background daemon thread that periodically checks
5+ for tasks needing lease extension heartbeats. Due heartbeats are dispatched
6+ to a small fixed ThreadPoolExecutor for parallel, non-blocking API calls.
7+
8+ This decouples heartbeat work entirely from worker poll loops, preventing
9+ heartbeat API calls (and their retries) from blocking task polling.
10+
11+ Thread-safe: track() and untrack() can be called from any thread or event loop.
12+ """
13+
14+ import logging
15+ import os
16+ import threading
17+ import time
18+ from concurrent .futures import ThreadPoolExecutor
319from dataclasses import dataclass
20+ from typing import Any , Dict , Optional
21+
22+ from conductor .client .http .models .task_result import TaskResult
23+
24+ logger = logging .getLogger (__name__ )
425
526# Lease extension constants (matches Java SDK)
627LEASE_EXTEND_RETRY_COUNT = 3
@@ -15,3 +36,189 @@ class LeaseInfo:
1536 response_timeout_seconds : float
1637 last_heartbeat_time : float # time.monotonic() of last heartbeat (or task start)
1738 interval_seconds : float # 80% of responseTimeoutSeconds
39+ task_client : Any = None # Sync TaskResourceApi for sending heartbeats
40+
41+
42+ class LeaseManager :
43+ """Centralized lease extension manager for all workers in a process.
44+
45+ One background daemon thread checks for due heartbeats at a fixed interval.
46+ A small ThreadPoolExecutor sends heartbeat API calls in parallel.
47+ Poll loops are never blocked by heartbeat work.
48+
49+ Usage:
50+ manager = LeaseManager.get_instance()
51+ manager.track(task_id, workflow_id, timeout, task_client)
52+ # ... task completes ...
53+ manager.untrack(task_id)
54+ """
55+
56+ _instance : Optional ['LeaseManager' ] = None
57+ _instance_lock = threading .Lock ()
58+ _instance_pid : Optional [int ] = None
59+
60+ @classmethod
61+ def get_instance (cls , check_interval : float = 1.0 ,
62+ max_heartbeat_workers : int = 4 ) -> 'LeaseManager' :
63+ """Get or create the process-wide LeaseManager singleton.
64+
65+ Fork-safe: a new instance is created after fork (threads don't survive fork).
66+ """
67+ current_pid = os .getpid ()
68+ if cls ._instance is None or cls ._instance_pid != current_pid :
69+ with cls ._instance_lock :
70+ if cls ._instance is None or cls ._instance_pid != current_pid :
71+ cls ._instance = cls (
72+ check_interval = check_interval ,
73+ max_heartbeat_workers = max_heartbeat_workers ,
74+ )
75+ cls ._instance_pid = current_pid
76+ return cls ._instance
77+
78+ @classmethod
79+ def _reset_instance (cls ):
80+ """Reset the singleton. For testing only."""
81+ with cls ._instance_lock :
82+ if cls ._instance is not None :
83+ cls ._instance .shutdown ()
84+ cls ._instance = None
85+ cls ._instance_pid = None
86+
87+ def __init__ (self , check_interval : float = 1.0 , max_heartbeat_workers : int = 4 ):
88+ self ._tracked : Dict [str , LeaseInfo ] = {}
89+ self ._lock = threading .Lock ()
90+ self ._executor = ThreadPoolExecutor (
91+ max_workers = max_heartbeat_workers ,
92+ thread_name_prefix = "lease-heartbeat" ,
93+ )
94+ self ._stop_event = threading .Event ()
95+ self ._check_interval = check_interval
96+ self ._thread : Optional [threading .Thread ] = None
97+ self ._started = False
98+ self ._start_lock = threading .Lock ()
99+
100+ def _ensure_started (self ) -> None :
101+ """Lazily start the background thread on first track() call."""
102+ if self ._started :
103+ return
104+ with self ._start_lock :
105+ if not self ._started :
106+ self ._thread = threading .Thread (
107+ target = self ._run , daemon = True , name = "lease-manager" ,
108+ )
109+ self ._thread .start ()
110+ self ._started = True
111+ logger .debug (
112+ "LeaseManager started (check_interval=%.1fs)" , self ._check_interval ,
113+ )
114+
115+ def track (self , task_id : str , workflow_instance_id : str ,
116+ response_timeout_seconds : float , task_client : Any ) -> None :
117+ """Start tracking a task for lease extension heartbeats.
118+
119+ Thread-safe. Can be called from any worker thread or event loop.
120+
121+ Args:
122+ task_id: Conductor task ID.
123+ workflow_instance_id: Workflow instance this task belongs to.
124+ response_timeout_seconds: The task's server-side response timeout.
125+ task_client: A **sync** TaskResourceApi for sending heartbeat API calls.
126+ """
127+ interval = response_timeout_seconds * LEASE_EXTEND_DURATION_FACTOR
128+ if interval < 1 :
129+ logger .debug (
130+ "Skipping lease tracking for task %s (interval %.1fs too short)" ,
131+ task_id , interval ,
132+ )
133+ return
134+
135+ info = LeaseInfo (
136+ task_id = task_id ,
137+ workflow_instance_id = workflow_instance_id ,
138+ response_timeout_seconds = response_timeout_seconds ,
139+ last_heartbeat_time = time .monotonic (),
140+ interval_seconds = interval ,
141+ task_client = task_client ,
142+ )
143+ with self ._lock :
144+ self ._tracked [task_id ] = info
145+ self ._ensure_started ()
146+ logger .debug (
147+ "Tracking lease for task %s (timeout=%ss, heartbeat every %ss)" ,
148+ task_id , response_timeout_seconds , interval ,
149+ )
150+
151+ def untrack (self , task_id : str ) -> None :
152+ """Stop tracking a task. Thread-safe."""
153+ with self ._lock :
154+ removed = self ._tracked .pop (task_id , None )
155+ if removed is not None :
156+ logger .debug ("Untracked lease for task %s" , task_id )
157+
158+ @property
159+ def tracked_count (self ) -> int :
160+ """Number of currently tracked tasks."""
161+ with self ._lock :
162+ return len (self ._tracked )
163+
164+ # -- Background thread -----------------------------------------------------
165+
166+ def _run (self ) -> None :
167+ """Background loop — checks for due heartbeats at fixed intervals."""
168+ while not self ._stop_event .is_set ():
169+ try :
170+ self ._check_and_send ()
171+ except Exception as e :
172+ logger .error ("LeaseManager error: %s" , e )
173+ self ._stop_event .wait (self ._check_interval )
174+
175+ def _check_and_send (self ) -> None :
176+ """Find tasks with due heartbeats and dispatch to the thread pool."""
177+ now = time .monotonic ()
178+ with self ._lock :
179+ due = [
180+ info for info in self ._tracked .values ()
181+ if now - info .last_heartbeat_time >= info .interval_seconds
182+ ]
183+ for info in due :
184+ # Update timestamp immediately to prevent double-dispatch on next tick
185+ info .last_heartbeat_time = time .monotonic ()
186+ self ._executor .submit (self ._send_heartbeat , info )
187+
188+ @staticmethod
189+ def _send_heartbeat (info : LeaseInfo ) -> None :
190+ """Send a single lease extension heartbeat with retry.
191+
192+ Runs in a pool thread — blocking retries only block the pool thread,
193+ never a poll loop.
194+ """
195+ result = TaskResult (
196+ task_id = info .task_id ,
197+ workflow_instance_id = info .workflow_instance_id ,
198+ extend_lease = True ,
199+ )
200+ for attempt in range (LEASE_EXTEND_RETRY_COUNT ):
201+ try :
202+ info .task_client .update_task (body = result )
203+ logger .debug ("Extended lease for task %s" , info .task_id )
204+ return
205+ except Exception as e :
206+ if attempt < LEASE_EXTEND_RETRY_COUNT - 1 :
207+ time .sleep (0.5 * (attempt + 2 ))
208+ else :
209+ logger .error (
210+ "Failed to extend lease for task %s after %d attempts: %s" ,
211+ info .task_id , LEASE_EXTEND_RETRY_COUNT , e ,
212+ )
213+
214+ # -- Lifecycle -------------------------------------------------------------
215+
216+ def shutdown (self ) -> None :
217+ """Stop the background thread and thread pool."""
218+ self ._stop_event .set ()
219+ if self ._started and self ._thread is not None :
220+ self ._thread .join (timeout = 5 )
221+ self ._executor .shutdown (wait = False )
222+ with self ._lock :
223+ self ._tracked .clear ()
224+ logger .debug ("LeaseManager shut down" )
0 commit comments