@@ -30,7 +30,8 @@ class KafkaConnectionManager:
3030 'reconnect_backoff_ms' : 50 ,
3131 'reconnect_backoff_max_ms' : 30000 ,
3232 'request_timeout_ms' : 30000 ,
33- 'socket_connection_timeout_ms' : 5000 ,
33+ 'socket_connection_setup_timeout_ms' : 10000 ,
34+ 'socket_connection_setup_timeout_max_ms' : 30000 ,
3435 'socket_options' : [
3536 (socket .IPPROTO_TCP , socket .TCP_NODELAY , 1 ),
3637 (socket .SOL_SOCKET , socket .SO_KEEPALIVE , 1 ),
@@ -86,7 +87,7 @@ def __init__(self, net, **configs):
8687 )
8788 self .cluster .attach (self )
8889 self ._conns = {}
89- self ._backoff = dict () # node_id => (failures, backoff_until)
90+ self ._backoff = dict () # node_id => (failures, backoff_until, socket_connect_setup_timeout_ms )
9091 # Cache the most recent SASL / SSL / auth failure per node so we can
9192 # surface it to the user instead of silently retrying forever.
9293 # Cleared on successful connect.
@@ -119,7 +120,9 @@ async def _do_bootstrap(self, deadline):
119120 bootstrap_broker = random .choice (self .cluster .bootstrap_brokers ())
120121 log .debug ('Attempting bootstrap with %s' , bootstrap_broker )
121122 try :
123+ timeout_ms = (deadline - time .monotonic ()) * 1000 if deadline is not None else None
122124 conn = self .get_connection (bootstrap_broker .node_id ,
125+ timeout_ms = timeout_ms ,
123126 pop_on_close = False ,
124127 refresh_metadata_on_err = False ,
125128 reset_backoff_on_connect = False )
@@ -218,10 +221,11 @@ def _build_ssl_context(self):
218221 ctx .verify_flags |= ssl .VERIFY_CRL_CHECK_LEAF
219222 return ctx
220223
221- async def _build_transport (self , node ):
224+ async def _build_transport (self , node , timeout_at = None ):
222225 sock = await create_connection (self ._net , node .host , node .port ,
223226 self .config ['socket_options' ],
224- proxy_url = self .config ['proxy_url' ])
227+ proxy_url = self .config ['proxy_url' ],
228+ timeout_at = timeout_at )
225229 if self .ssl_enabled :
226230 transport = KafkaSSLTransport (self ._net , sock , self ._build_ssl_context (),
227231 host = node .host , ssl_check_hostname = self .config ['ssl_check_hostname' ])
@@ -235,11 +239,11 @@ async def _build_transport(self, node):
235239 else :
236240 return transport
237241
238- async def _connect (self , node , conn , reset_backoff_on_connect = True ):
242+ async def _connect (self , node , conn , reset_backoff_on_connect = True , timeout_at = None ):
239243 try :
240- transport = await self ._build_transport (node )
244+ transport = await self ._build_transport (node , timeout_at = timeout_at )
241245 conn .connection_made (transport )
242- await conn .init_future
246+ await conn .initialize ( timeout_at = timeout_at )
243247 except Exception as exc :
244248 log .error ('Connection failed: %s' , exc )
245249 conn .connection_lost (exc )
@@ -280,12 +284,10 @@ def get_connection(self, node_id, timeout_ms=None,
280284 if refresh_metadata_on_err :
281285 conn .close_future .add_errback (lambda _ : self .cluster .request_update ())
282286 self ._conns [node_id ] = conn
283- self ._net .call_soon (lambda : self ._connect (node , conn , reset_backoff_on_connect = reset_backoff_on_connect ))
284287 if timeout_ms is None :
285- timeout_ms = self .config ['socket_connection_timeout_ms' ]
286- self ._net .call_later (timeout_ms / 1000 ,
287- lambda : conn .close (Errors .KafkaConnectionError ('Connection timed out' ))
288- if not conn .init_future .is_done else None )
288+ timeout_ms = self .socket_connection_setup_timeout_ms (node_id )
289+ timeout_at = time .monotonic () + timeout_ms / 1000
290+ self ._net .call_soon (lambda : self ._connect (node , conn , reset_backoff_on_connect = reset_backoff_on_connect , timeout_at = timeout_at ))
289291 return conn
290292
291293 def send (self , request , node_id = None , request_timeout_ms = None ):
@@ -335,18 +337,29 @@ def reset_backoff(self, node_id):
335337 except KeyError :
336338 pass
337339
def jitter_pct(self):
    """Return a random multiplier drawn uniformly between 0.8 and 1.2.

    Callers multiply a computed delay by this factor to randomize it.
    """
    lower, upper = 0.8, 1.2
    return random.uniform(lower, upper)
340342
343+ def _calculate_exp_timeout (self , key , failures ):
344+ max_keys = {
345+ 'reconnect_backoff_ms' : 'reconnect_backoff_max_ms' ,
346+ 'socket_connection_setup_timeout_ms' : 'socket_connection_setup_timeout_max_ms' ,
347+ }
348+ timeout_ms = self .config [key ] * 2 ** (failures - 1 )
349+ if key in max_keys :
350+ max_ms = self .config [max_keys [key ]]
351+ timeout_ms = min (max_ms , timeout_ms )
352+ return timeout_ms * self .jitter_pct ()
353+
def update_backoff(self, node_id):
    """Record one more connection failure for ``node_id``.

    Increments the node's failure count, derives a new reconnect backoff
    and a new connection setup timeout from that count, and stores
    ``(failures, backoff_until, connect_timeout_ms)`` in ``self._backoff``.
    ``backoff_until`` is a ``time.monotonic()`` value.
    """
    previous_entry = self._backoff.get(node_id, (0, 0, 0))
    prior_failures, _, _ = previous_entry
    failures = prior_failures + 1

    # The reconnect backoff is computed before the connect timeout so the
    # two jitter draws happen in a fixed order.
    backoff_ms = self._calculate_exp_timeout('reconnect_backoff_ms', failures)
    connect_ms = self._calculate_exp_timeout('socket_connection_setup_timeout_ms', failures)

    log.debug('%s reconnect backoff %d ms / connect timeout %d ms after %s failures',
              node_id, backoff_ms, connect_ms, failures)

    backoff_until_time = time.monotonic() + (backoff_ms / 1000)
    self._backoff[node_id] = (failures, backoff_until_time, connect_ms)
350363
351364 def connection_delay (self , node_id ):
352365 """Connection delay in seconds.
@@ -357,6 +370,11 @@ def connection_delay(self, node_id):
357370 return 0
358371 return max (0 , self ._backoff [node_id ][1 ] - time .monotonic ())
359372
def socket_connection_setup_timeout_ms(self, node_id):
    """Return the connection setup timeout, in milliseconds, for ``node_id``.

    If the node has an entry in ``self._backoff``, the timeout stored in
    that entry (index 2) is returned; otherwise the configured
    ``socket_connection_setup_timeout_ms`` value is used.
    """
    try:
        entry = self._backoff[node_id]
    except KeyError:
        # No recorded failures for this node: use the configured base value.
        return self.config['socket_connection_setup_timeout_ms']
    return entry[2]
377+
360378 def auth_failure (self , node_id ):
361379 """Return the most recent auth-class failure for ``node_id``,
362380 or None if there is no sticky failure on record."""
0 commit comments