Skip to content

Commit 7b06ecb

Browse files
committed
fix(iroh): retry rebind on EADDRINUSE
1 parent a036a85 commit 7b06ecb

2 files changed

Lines changed: 58 additions & 17 deletions

File tree

iroh/src/socket.rs

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2370,15 +2370,6 @@ mod tests {
23702370
Ok(())
23712371
}
23722372

2373-
// Skipped on Android: the GitHub-hosted emulator's network stack
2374-
// returns EADDRINUSE long enough after force_network_change() that
2375-
// the rebind here fails and the subsequent connect() never wakes
2376-
// the connection driver. Locally on a real emulator this passes,
2377-
// so the test is only ignored under cfg(target_os = "android").
2378-
#[cfg_attr(
2379-
target_os = "android",
2380-
ignore = "rebind flakes against the GitHub Android emulator"
2381-
)]
23822373
#[tokio::test]
23832374
#[traced_test]
23842375
async fn test_regression_network_change_rebind_wakes_connection_driver() -> Result {

iroh/src/socket/transports/ip.rs

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,23 @@ use std::{
33
net::{IpAddr, SocketAddr, SocketAddrV4, SocketAddrV6},
44
num::NonZeroUsize,
55
pin::Pin,
6-
sync::Arc,
6+
sync::{Arc, Mutex},
77
task::{Context, Poll},
8+
time::Duration,
89
};
910

1011
use ipnet::{Ipv4Net, Ipv6Net};
12+
use n0_future::task::AbortOnDropHandle;
1113
use n0_watcher::Watchable;
1214
use netwatch::{UdpSender, UdpSocket};
1315
use pin_project::pin_project;
14-
use tracing::{debug, info, trace};
16+
use tokio::time;
17+
use tracing::{debug, info, trace, warn};
18+
19+
/// Maximum number of delayed retries for a rebind that keeps failing with `EADDRINUSE` (about 3 s in total at `REBIND_RETRY_DELAY`).
20+
const REBIND_RETRY_ATTEMPTS: u32 = 12;
21+
/// Delay between rebind attempts that failed with `EADDRINUSE`.
22+
const REBIND_RETRY_DELAY: Duration = Duration::from_millis(250);
1523

1624
use super::{Addr, Transmit};
1725
use crate::metrics::{EndpointMetrics, SocketMetrics};
@@ -242,6 +250,7 @@ impl IpTransport {
242250
IpNetworkChangeSender {
243251
socket: self.socket.clone(),
244252
local_addr: self.local_addr.clone(),
253+
rebind_task: Default::default(),
245254
}
246255
}
247256

@@ -259,17 +268,58 @@ impl IpTransport {
259268
pub(super) struct IpNetworkChangeSender {
260269
socket: Arc<UdpSocket>,
261270
local_addr: Watchable<SocketAddr>,
271+
rebind_task: Mutex<Option<AbortOnDropHandle<()>>>,
262272
}
263273

264274
impl IpNetworkChangeSender {
265275
pub(super) fn rebind(&self) -> io::Result<()> {
266276
let old_addr = self.local_addr.get();
267-
self.socket.rebind()?;
268-
let addr = self.socket.local_addr()?;
269-
self.local_addr.set(addr).ok();
270-
trace!("rebound from {} to {}", old_addr, addr);
271-
272-
Ok(())
277+
// Cancel any retry task from a previous rebind: dropping its `AbortOnDropHandle` aborts it.
278+
let mut rebind_task = self.rebind_task.lock().expect("poisoned");
279+
*rebind_task = None;
280+
// Try to rebind immediately; on `EADDRINUSE` the retries continue in a background task and this call still returns `Ok(())`.
281+
match self.socket.rebind() {
282+
Ok(()) => {
283+
let addr = self.socket.local_addr()?;
284+
self.local_addr.set(addr).ok();
285+
trace!("rebound from {} to {}", old_addr, addr);
286+
Ok(())
287+
}
288+
Err(err) if err.kind() == io::ErrorKind::AddrInUse => {
289+
let socket = self.socket.clone();
290+
let local_addr = self.local_addr.clone();
291+
let fut = async move {
292+
let mut attempt = 0;
293+
loop {
294+
match socket.rebind() {
295+
Ok(()) => break,
296+
Err(err)
297+
if err.kind() == io::ErrorKind::AddrInUse
298+
&& attempt < REBIND_RETRY_ATTEMPTS =>
299+
{
300+
attempt += 1;
301+
debug!(
302+
?err,
303+
attempt, "rebind hit EADDRINUSE on {old_addr}, retrying"
304+
);
305+
time::sleep(REBIND_RETRY_DELAY).await;
306+
}
307+
Err(err) => {
308+
warn!("rebinding IP transport failed: {err:#}");
309+
return;
310+
}
311+
}
312+
}
313+
if let Ok(addr) = socket.local_addr() {
314+
local_addr.set(addr).ok();
315+
trace!("rebound from {} to {}", old_addr, addr);
316+
}
317+
};
318+
*rebind_task = Some(AbortOnDropHandle::new(n0_future::task::spawn(fut)));
319+
Ok(())
320+
}
321+
Err(err) => Err(err),
322+
}
273323
}
274324

275325
pub(super) fn on_network_change(&self, _info: &crate::socket::Report) {

0 commit comments

Comments
 (0)