@@ -107,76 +107,108 @@ def stop(self):
107107 self .wake_thread .join (timeout = 2.0 )
108108
def _run_wake_monitoring(self):
    """Run the background wake-scheduling loop until a stop is requested.

    Two mechanisms are combined for reliability:

    1. Redis keyspace notifications: the thread subscribes to
       ``__keyevent@*__:expired`` and queues a wake as soon as a
       ``:<app_name>:wake:`` or ``:<app_name>:workflow_wake:`` key expires.
    2. Periodic reconciliation: every ``WAKE_RECONCILIATION_INTERVAL_S``
       seconds ``_process_due_wakes`` is called as a fallback for
       notifications that were missed.

    The outer loop owns the connection lifecycle: each iteration opens a
    fresh Redis client and pubsub, and any error tears both down and
    retries. Back-off waits are performed on ``self._stop_wake_monitor``
    rather than with ``time.sleep`` so that a stop request interrupts the
    wait immediately instead of holding the thread for the full delay.
    """
    import redis as sync_redis

    from ..logging import setup_logger

    logger = setup_logger(__name__)

    while not self._stop_wake_monitor.is_set():
        rdb = None
        pubsub = None

        try:
            rdb = sync_redis.Redis(
                host=self.config.host,
                port=self.config.port,
                db=self.config.database,
                password=self.config.password,
                ssl=self.config.ssl,
            )

            # Best effort: a server that rejects CONFIG SET raises
            # ResponseError; reconciliation below still covers wakes.
            with suppress(sync_redis.exceptions.ResponseError):
                rdb.config_set("notify-keyspace-events", "Ex")

            logger.info("Wake monitoring thread started, processing due wakes")
            due_count = self._process_due_wakes(rdb)
            if due_count > 0:
                logger.info(
                    "Processed due wakes on startup",
                    extra={"count": due_count},
                )

            pubsub = rdb.pubsub()
            pubsub.psubscribe("__keyevent@*__:expired")
            last_reconciliation = time.time()

            while not self._stop_wake_monitor.is_set():
                try:
                    # Short timeout keeps the loop responsive to the stop
                    # event and to the reconciliation deadline.
                    message = pubsub.get_message(timeout=0.1)

                    if message and message["type"] == "pmessage":
                        try:
                            key = message["data"].decode("utf-8")
                            if f":{self.app_name}:wake:" in key:
                                loop_id = key.split(":")[-1]
                                logger.info(
                                    "Loop wake key expired",
                                    extra={"loop_id": loop_id},
                                )
                                self._queue_wake(rdb, loop_id)
                            elif f":{self.app_name}:workflow_wake:" in key:
                                workflow_id = key.split(":")[-1]
                                logger.info(
                                    "Workflow wake key expired",
                                    extra={"workflow_id": workflow_id},
                                )
                                self._queue_wake(rdb, workflow_id)
                        except Exception as e:
                            # One bad notification must not stop monitoring.
                            logger.error(f"Error processing wake notification: {e}")

                    now = time.time()
                    if now - last_reconciliation >= WAKE_RECONCILIATION_INTERVAL_S:
                        due_count = self._process_due_wakes(rdb)
                        if due_count > 0:
                            logger.info(
                                "Wake reconciliation processed due wakes",
                                extra={
                                    "count": due_count,
                                    "queue_size": self.wake_queue.qsize(),
                                },
                            )
                        last_reconciliation = now

                except sync_redis.exceptions.ConnectionError as e:
                    logger.warning(
                        f"Redis connection error in wake monitor inner loop: {e}, reconnecting"
                    )
                    break  # Leave the inner loop so the outer loop reconnects.

        except sync_redis.exceptions.ConnectionError as e:
            logger.warning(
                f"Redis connection error in wake monitor: {e}, retrying in 5s"
            )
            # Interruptible back-off: returns early once stop is requested.
            self._stop_wake_monitor.wait(5)
        except Exception as e:
            logger.error(f"Wake monitoring thread error: {e}, retrying in 5s")
            # Interruptible back-off: returns early once stop is requested.
            self._stop_wake_monitor.wait(5)
        finally:
            # Always release the connection before reconnecting or exiting.
            if pubsub:
                with suppress(Exception):
                    pubsub.close()
            if rdb:
                with suppress(Exception):
                    rdb.close()

    logger.info("Wake monitoring thread stopped")
180212
181213 def _process_due_wakes (self , rdb ) -> int :
182214 """Process all wakes with score <= now. Returns count processed."""
0 commit comments