@@ -33,8 +33,29 @@ use crate::{AimDb, RuntimeAdapter};
3333pub struct ClientConfig {
3434 /// Redial after a dropped/failed connection instead of ending the engine.
3535 pub reconnect : bool ,
36- /// Delay before each redial when `reconnect` is set.
36+ /// Base delay before the first redial when `reconnect` is set. Subsequent
37+ /// redials grow this exponentially, capped at [`max_reconnect_delay`](Self::max_reconnect_delay).
3738 pub reconnect_delay : Duration ,
39+ /// Upper bound for the exponential reconnect backoff. Defaults to
40+ /// [`reconnect_delay`](Self::reconnect_delay) (i.e. no escalation — a fixed
41+ /// delay, preserving the pre-Phase-4 behavior).
42+ pub max_reconnect_delay : Duration ,
43+ /// Maximum redial attempts before the engine gives up. `0` = unlimited
44+ /// (the default).
45+ pub max_reconnect_attempts : usize ,
46+ /// If set, send a keepalive `Ping` on this interval while a connection is
47+ /// idle. `None` (default) disables keepalive.
48+ pub keepalive_interval : Option < Duration > ,
49+ /// Cap on caller commands buffered while disconnected; the oldest are dropped
50+ /// past this bound. Defaults to `usize::MAX` (effectively unbounded — the
51+ /// pre-Phase-4 behavior).
52+ pub max_offline_queue : usize ,
53+ /// Key the subscription demux by **topic** instead of the engine request id.
54+ /// `false` (default, AimX-style) — events carry the request id back, demux by
55+ /// id. `true` (WS-style) — the wire pushes data keyed by topic with no id, so
56+ /// the codec's `decode_outbound` returns the topic as `Event.sub` and the
57+ /// engine routes by topic.
58+ pub topic_routed_subs : bool ,
3859 /// Send a Ping handshake on connect and wait for the Pong before accepting
3960 /// caller commands (the proactive "handshake-as-caller"). Mirrors the
4061 /// server's `reads_hello`; a real protocol swaps Ping/Pong for its Hello.
@@ -46,11 +67,31 @@ impl Default for ClientConfig {
4667 Self {
4768 reconnect : true ,
4869 reconnect_delay : Duration :: from_millis ( 200 ) ,
70+ max_reconnect_delay : Duration :: from_millis ( 200 ) ,
71+ max_reconnect_attempts : 0 ,
72+ keepalive_interval : None ,
73+ max_offline_queue : usize:: MAX ,
74+ topic_routed_subs : false ,
4975 sends_hello : false ,
5076 }
5177 }
5278}
5379
80+ /// Exponential backoff for the `attempt`-th redial (1-based), capped at
81+ /// [`ClientConfig::max_reconnect_delay`]. Defaults collapse this to a fixed
82+ /// `reconnect_delay` (max == base), preserving pre-Phase-4 behavior.
83+ fn backoff_delay ( config : & ClientConfig , attempt : usize ) -> Duration {
84+ let base = config. reconnect_delay ;
85+ let cap = config. max_reconnect_delay . max ( base) ;
86+ let shift = attempt. saturating_sub ( 1 ) . min ( 16 ) as u32 ;
87+ base. saturating_mul ( 1u32 << shift) . min ( cap)
88+ }
89+
90+ /// Bound the offline backlog: drop the oldest buffered commands beyond `cap`.
91+ fn bound_offline_queue ( cmd_rx : & mut mpsc:: UnboundedReceiver < ClientCmd > , cap : usize ) {
92+ while cmd_rx. len ( ) > cap && cmd_rx. try_recv ( ) . is_ok ( ) { }
93+ }
94+
5495/// A cheap-clone handle to a running [`run_client`] engine — the caller-facing
5596/// RPC surface. Every method funnels a command to the engine, which owns the
5697/// pending-call map and the wire.
@@ -165,33 +206,62 @@ async fn client_loop<D, C>(
165206 D : Dialer ,
166207 C : EnvelopeCodec ,
167208{
209+ // Consecutive failed attempts since the last successful connection; drives
210+ // exponential backoff and the optional attempt cap.
211+ let mut attempt: usize = 0 ;
168212 loop {
169213 let conn = match dialer. connect ( ) . await {
170- Ok ( conn) => conn,
214+ Ok ( conn) => {
215+ attempt = 0 ;
216+ conn
217+ }
171218 Err ( _e) => {
172219 #[ cfg( feature = "tracing" ) ]
173220 tracing:: warn!( "client dial failed: {:?}" , _e) ;
174- if config. reconnect {
175- tokio :: time :: sleep ( config . reconnect_delay ) . await ;
176- continue ;
221+ match reconnect_after ( & mut attempt , & config, & mut cmd_rx ) . await {
222+ true => continue ,
223+ false => return ,
177224 }
178- return ;
179225 }
180226 } ;
181227
182228 match drive_connection ( conn, & codec, & mut cmd_rx, & config) . await {
183229 Ended :: HandlesDropped => return ,
184230 Ended :: Disconnected => {
185- if config. reconnect {
186- tokio :: time :: sleep ( config . reconnect_delay ) . await ;
187- continue ;
231+ match reconnect_after ( & mut attempt , & config, & mut cmd_rx ) . await {
232+ true => continue ,
233+ false => return ,
188234 }
189- return ;
190235 }
191236 }
192237 }
193238}
194239
240+ /// Decide whether to redial: honor `reconnect`, the attempt cap, the offline-queue
241+ /// bound, and the exponential backoff sleep. Returns `true` to retry, `false` to
242+ /// stop the engine.
243+ async fn reconnect_after (
244+ attempt : & mut usize ,
245+ config : & ClientConfig ,
246+ cmd_rx : & mut mpsc:: UnboundedReceiver < ClientCmd > ,
247+ ) -> bool {
248+ if !config. reconnect {
249+ return false ;
250+ }
251+ * attempt += 1 ;
252+ if config. max_reconnect_attempts != 0 && * attempt >= config. max_reconnect_attempts {
253+ #[ cfg( feature = "tracing" ) ]
254+ tracing:: warn!(
255+ "client giving up after {} reconnect attempts" ,
256+ config. max_reconnect_attempts
257+ ) ;
258+ return false ;
259+ }
260+ bound_offline_queue ( cmd_rx, config. max_offline_queue ) ;
261+ tokio:: time:: sleep ( backoff_delay ( config, * attempt) ) . await ;
262+ true
263+ }
264+
195265/// Drive one dialed [`Connection`]: optional handshake, then `biased` demux of
196266/// server frames (resolve `Reply` by `id`, route `Event`/`Snapshot` to their
197267/// subscription channels) interleaved with caller commands. Pending state is
@@ -229,6 +299,9 @@ where
229299 }
230300 }
231301
302+ // Optional keepalive ticker — `None` parks the arm forever (see below).
303+ let mut keepalive = config. keepalive_interval . map ( tokio:: time:: interval) ;
304+
232305 loop {
233306 tokio:: select! {
234307 biased;
@@ -269,10 +342,31 @@ where
269342 }
270343 }
271344 Ok ( Outbound :: Pong ) => { }
345+ // Explicit subscribe ack (WS). Informational — the local
346+ // event sink already exists from the Subscribe command, so
347+ // there is nothing to route; just confirm liveness.
348+ Ok ( Outbound :: Subscribed { .. } ) => { }
272349 Err ( _e) => continue , // skip a malformed frame, keep the connection
273350 }
274351 }
275352
353+ // ---- keepalive: send a Ping when the ticker fires --------------
354+ // With no interval configured the arm parks on `pending()` forever,
355+ // so it never wins the `select!`.
356+ _ = async {
357+ match keepalive. as_mut( ) {
358+ Some ( i) => { i. tick( ) . await ; }
359+ None => std:: future:: pending:: <( ) >( ) . await ,
360+ }
361+ } => {
362+ out. clear( ) ;
363+ if codec. encode_inbound( Inbound :: Ping , & mut out) . is_ok( )
364+ && conn. send( & out) . await . is_err( )
365+ {
366+ return Ended :: Disconnected ;
367+ }
368+ }
369+
276370 // ---- caller commands from ClientHandle -------------------------
277371 cmd = cmd_rx. recv( ) => {
278372 let cmd = match cmd {
@@ -299,7 +393,14 @@ where
299393 ClientCmd :: Subscribe { topic, events } => {
300394 let id = next_id;
301395 next_id += 1 ;
302- subs. insert( id. to_string( ) , events) ;
396+ // Topic-routed (WS): the wire pushes data keyed by topic,
397+ // so demux by topic; id-routed (AimX): events echo the id.
398+ let key = if config. topic_routed_subs {
399+ topic. clone( )
400+ } else {
401+ id. to_string( )
402+ } ;
403+ subs. insert( key, events) ;
303404 out. clear( ) ;
304405 let sent = codec
305406 . encode_inbound( Inbound :: Subscribe { id, topic } , & mut out)
0 commit comments