@@ -79,30 +79,19 @@ def __init__(
7979
8080 self .wake_queue : Queue [str ] = wake_queue
8181 self ._stop_wake_monitor = threading .Event ()
82-
82+
8383 if self .wake_queue :
8484 self .wake_thread = threading .Thread (
8585 target = self ._run_wake_monitoring , daemon = True
8686 )
8787 self .wake_thread .start ()
8888
8989 def _run_wake_monitoring (self ):
90- """
91- Background thread for reliable wake scheduling.
92-
93- Uses a hybrid approach for reliability:
94- 1. ZSET (sorted set) is the source of truth for all scheduled wakes
95- 2. Periodic reconciliation checks for due wakes every WAKE_RECONCILIATION_INTERVAL_S
96- 3. TTL keys + keyspace notifications provide low-latency wake for normal operation
97-
98- This ensures wakes are never missed even if:
99- - The service was down when a wake was due
100- - Keyspace notifications were missed
101- - Redis pub/sub disconnected temporarily
102- """
90+ """Background thread for reliable wake scheduling using ZSET + periodic reconciliation."""
10391 import redis as sync_redis
92+
10493 from ..logging import setup_logger
105-
94+
10695 logger = setup_logger (__name__ )
10796
10897 try :
@@ -114,23 +103,18 @@ def _run_wake_monitoring(self):
114103 ssl = self .config .ssl ,
115104 )
116105
117- # Enable keyspace notifications (best-effort, not required for reliability)
118106 with suppress (sync_redis .exceptions .ResponseError ):
119107 rdb .config_set ("notify-keyspace-events" , "Ex" )
120108
121- # Process any wakes that were missed while we were down
122109 self ._process_due_wakes (rdb )
123110
124- # Set up pub/sub for low-latency wake notifications
125111 pubsub = rdb .pubsub ()
126112 pubsub .psubscribe ("__keyevent@*__:expired" )
127-
128113 last_reconciliation = time .time ()
129114
130115 while not self ._stop_wake_monitor .is_set ():
131- # Non-blocking check for keyspace notifications
132116 message = pubsub .get_message (timeout = 0.1 )
133-
117+
134118 if message and message ["type" ] == "pmessage" :
135119 try :
136120 key = message ["data" ].decode ("utf-8" )
@@ -140,7 +124,6 @@ def _run_wake_monitoring(self):
140124 except Exception as e :
141125 logger .error (f"Error processing wake notification: { e } " )
142126
143- # Periodic reconciliation - the reliability guarantee
144127 now = time .time ()
145128 if now - last_reconciliation >= WAKE_RECONCILIATION_INTERVAL_S :
146129 self ._process_due_wakes (rdb )
@@ -150,44 +133,24 @@ def _run_wake_monitoring(self):
150133 logger .error (f"Wake monitoring thread error: { e } " )
151134
152135 def _process_due_wakes (self , rdb ) -> int :
153- """
154- Process all wakes that are due (score <= now).
155-
156- Uses ZRANGEBYSCORE to atomically get and remove due entries.
157- Returns the number of wakes processed.
158- """
136+ """Process all wakes with score <= now. Returns count processed."""
159137 schedule_key = RedisKeys .LOOP_WAKE_SCHEDULE .format (app_name = self .app_name )
160138 now = time .time ()
161139 processed = 0
162140
163- # Get all due wakes (score <= now)
164141 due_wakes : list [bytes ] = rdb .zrangebyscore (schedule_key , "-inf" , now )
165-
166142 for loop_id_bytes in due_wakes :
167143 loop_id = loop_id_bytes .decode ("utf-8" )
168-
169- # Atomically remove from schedule (only if still there with same score)
170- # This prevents double-processing in multi-replica scenarios
171- removed = rdb .zrem (schedule_key , loop_id )
172-
173- if removed :
144+ if rdb .zrem (schedule_key , loop_id ):
174145 self .wake_queue .put (loop_id )
175146 processed += 1
176147
177148 return processed
178149
179150 def _queue_wake (self , rdb , loop_id : str ) -> bool :
180- """
181- Queue a wake for a loop, removing it from the schedule.
182-
183- Returns True if the wake was queued, False if already processed.
184- """
151+ """Remove loop from schedule and queue wake. Returns True if queued."""
185152 schedule_key = RedisKeys .LOOP_WAKE_SCHEDULE .format (app_name = self .app_name )
186-
187- # Remove from schedule - if it was there, queue the wake
188- removed = rdb .zrem (schedule_key , loop_id )
189-
190- if removed :
153+ if rdb .zrem (schedule_key , loop_id ):
191154 self .wake_queue .put (loop_id )
192155 return True
193156 return False
@@ -474,27 +437,16 @@ async def pop_event(
474437 return None
475438
476439 async def set_wake_time (self , loop_id : str , timestamp : float ) -> None :
477- """
478- Schedule a wake time for a loop.
479-
480- Uses two mechanisms for reliability:
481- 1. ZSET (sorted set) - Source of truth, survives restarts
482- 2. TTL key - Triggers keyspace notification for low-latency wake
483-
484- The periodic reconciliation in _process_due_wakes ensures wakes
485- are never missed even if keyspace notifications fail.
486- """
440+ """Schedule a wake time. Uses ZSET (source of truth) + TTL key (fast notification)."""
487441 if timestamp <= time .time ():
488442 raise ValueError ("Timestamp is in the past" )
489443
490444 schedule_key = RedisKeys .LOOP_WAKE_SCHEDULE .format (app_name = self .app_name )
491445 wake_key = RedisKeys .LOOP_WAKE_KEY .format (
492446 app_name = self .app_name , loop_id = loop_id
493447 )
494-
495448 ttl_ms = max (1 , int ((timestamp - time .time ()) * 1000 ))
496449
497- # Atomic: add to schedule and set TTL key
498450 async with self .rdb .pipeline (transaction = True ) as pipe :
499451 pipe .zadd (schedule_key , {loop_id : timestamp })
500452 pipe .set (wake_key , "1" , px = ttl_ms )
0 commit comments