diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index edbf8b38fcd..6a6244aa73a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -201,18 +201,32 @@ jobs: uses: reactivecircus/android-emulator-runner@v2 env: ANDROID_NDK_HOME: ${{ steps.setup-ndk.outputs.ndk-path }} + RUST_LOG: trace with: api-level: ${{ matrix.api-level }} arch: x86_64 target: google_apis force-avd-creation: false - emulator-options: -no-window -no-audio -no-boot-anim -gpu swiftshader_indirect + # `-dns-server` + `-netfast` + `-no-metrics` are the combination + # reported to give working public-internet connectivity on the + # GH-hosted runner; without them `wlan0` stays NO-CARRIER and + # every connect() to a public IP fails with ENETUNREACH. See + # https://github.com/ReactiveCircus/android-emulator-runner/issues/348#issuecomment-2578082030 + emulator-options: -no-window -no-audio -no-boot-anim -gpu swiftshader_indirect -dns-server 8.8.8.8 -netfast -no-metrics disable-animations: true # cargo-ndk pushes each test binary to /data/local/tmp on the # emulator and runs it via adb. script: | adb wait-for-device adb shell 'i=0; while [ -z "$(getprop sys.boot_completed | tr -d "\r")" ]; do i=$((i+1)); if [ $i -gt 300 ]; then echo "boot did not complete within 600s"; exit 1; fi; sleep 2; done' + # `sys.boot_completed=1` only signals that Android finished + # booting, not that the radio/Wi-Fi stack is up. Poll until + # the kernel actually has a route to a public IP, otherwise + # any test that hits the network fails with ENETUNREACH. + adb shell 'i=0; while ! 
ip route get 8.8.8.8 >/dev/null 2>&1; do i=$((i+1)); if [ $i -gt 60 ]; then echo "no route to 8.8.8.8 after 60s"; ip addr; ip route; exit 1; fi; sleep 1; done' + echo "=== ip addr ===" && adb shell ip addr + echo "=== ip route ===" && adb shell ip route + echo "=== net.dns1 ===" && adb shell getprop net.dns1 cargo ndk test -p iroh-base --all-features cargo ndk test -p iroh-dns --features tls-ring cargo ndk test -p iroh-relay --features tls-ring,metrics diff --git a/iroh-dns/tests/integration.rs b/iroh-dns/tests/integration.rs index f3e14536874..3a377a6a999 100644 --- a/iroh-dns/tests/integration.rs +++ b/iroh-dns/tests/integration.rs @@ -6,25 +6,13 @@ use std::time::Duration; +#[cfg(target_os = "android")] +use iroh_dns::dns::DnsProtocol; use iroh_dns::dns::DnsResolver; const TIMEOUT: Duration = Duration::from_secs(8); const HOST: &str = "dns.iroh.link"; -#[tokio::test] -async fn resolver_constructs_without_panic() { - let _resolver = DnsResolver::new(); -} - -// Ignored on Android: in the GitHub-hosted emulator the public DNS -// fallback's hickory connection pool repeatedly returns -// "no connections available" within ~30 ms, well before the 8s -// per-lookup timeout, so a resolution that works locally fails in -// CI. Tracking the actual fix separately; see Frando/android-dns-fix. -#[cfg_attr( - target_os = "android", - ignore = "flaky on emulator (no connections available)" -)] #[tokio::test] async fn resolver_resolves_dns_iroh_link() { let resolver = DnsResolver::new(); @@ -54,3 +42,33 @@ async fn resolver_resolves_dns_iroh_link() { ); eprintln!("{HOST} resolved to: {hits:?}"); } + +/// Resolves through the Android emulator's QEMU NAT DNS proxy. +/// +/// 10.0.2.3 is the well-known emulator DNS gateway, documented at +/// <https://developer.android.com/studio/run/emulator-networking>.
+/// Pointing the resolver at it explicitly sidesteps the missing +/// system-DNS reader (no JNI context here) so this test exercises +/// hickory's pool, sockets, and our `DnsResolver` plumbing against a +/// nameserver that is always reachable inside the emulator, +/// independent of whether public DNS is reachable on the runner. +#[cfg(target_os = "android")] +#[tokio::test] +async fn resolves_via_emulator_dns_proxy() { + let nameserver = "10.0.2.3:53".parse().unwrap(); + let resolver = DnsResolver::builder() + .with_nameserver(nameserver, DnsProtocol::Udp) + .build(); + + let addrs: Vec<_> = resolver + .lookup_ipv4(HOST, TIMEOUT) + .await + .expect("IPv4 lookup via 10.0.2.3 should succeed in the emulator") + .collect(); + + assert!( + !addrs.is_empty(), + "expected at least one A record for {HOST} via 10.0.2.3", + ); + eprintln!("{HOST} resolved via 10.0.2.3 to: {addrs:?}"); +} diff --git a/iroh/src/socket.rs b/iroh/src/socket.rs index d4943cfe872..bbb695a58e9 100644 --- a/iroh/src/socket.rs +++ b/iroh/src/socket.rs @@ -2370,15 +2370,6 @@ mod tests { Ok(()) } - // Skipped on Android: the GitHub-hosted emulator's network stack - // returns EADDRINUSE long enough after force_network_change() that - // the rebind here fails and the subsequent connect() never wakes - // the connection driver. Locally on a real emulator this passes, - // so the test is only ignored under cfg(target_os = "android"). 
- #[cfg_attr( - target_os = "android", - ignore = "rebind flakes against the GitHub Android emulator" - )] #[tokio::test] #[traced_test] async fn test_regression_network_change_rebind_wakes_connection_driver() -> Result { diff --git a/iroh/src/socket/transports/ip.rs b/iroh/src/socket/transports/ip.rs index 30200c8da4e..0f8765be0ee 100644 --- a/iroh/src/socket/transports/ip.rs +++ b/iroh/src/socket/transports/ip.rs @@ -3,15 +3,23 @@ use std::{ net::{IpAddr, SocketAddr, SocketAddrV4, SocketAddrV6}, num::NonZeroUsize, pin::Pin, - sync::Arc, + sync::{Arc, Mutex}, task::{Context, Poll}, + time::Duration, }; use ipnet::{Ipv4Net, Ipv6Net}; +use n0_future::task::AbortOnDropHandle; use n0_watcher::Watchable; use netwatch::{UdpSender, UdpSocket}; use pin_project::pin_project; -use tracing::{debug, info, trace}; +use tokio::time; +use tracing::{debug, info, trace, warn}; + +/// Total budget for retrying a rebind that fails with `EADDRINUSE`. +const REBIND_RETRY_ATTEMPTS: u32 = 12; +/// Delay between rebind attempts that failed with `EADDRINUSE`. +const REBIND_RETRY_DELAY: Duration = Duration::from_millis(250); use super::{Addr, Transmit}; use crate::metrics::{EndpointMetrics, SocketMetrics}; @@ -242,6 +250,7 @@ impl IpTransport { IpNetworkChangeSender { socket: self.socket.clone(), local_addr: self.local_addr.clone(), + rebind_task: Default::default(), } } @@ -259,17 +268,58 @@ impl IpTransport { pub(super) struct IpNetworkChangeSender { socket: Arc, local_addr: Watchable, + rebind_task: Mutex>>, } impl IpNetworkChangeSender { pub(super) fn rebind(&self) -> io::Result<()> { let old_addr = self.local_addr.get(); - self.socket.rebind()?; - let addr = self.socket.local_addr()?; - self.local_addr.set(addr).ok(); - trace!("rebound from {} to {}", old_addr, addr); - - Ok(()) + // Clear any previous rebind task. + let mut rebind_task = self.rebind_task.lock().expect("poisoned"); + *rebind_task = None; + // Try to rebind immediately. 
+ match self.socket.rebind() { + Ok(()) => { + let addr = self.socket.local_addr()?; + self.local_addr.set(addr).ok(); + trace!("rebound from {} to {}", old_addr, addr); + Ok(()) + } + Err(err) if err.kind() == io::ErrorKind::AddrInUse => { + let socket = self.socket.clone(); + let local_addr = self.local_addr.clone(); + let fut = async move { + let mut attempt = 0; + loop { + match socket.rebind() { + Ok(()) => break, + Err(err) + if err.kind() == io::ErrorKind::AddrInUse + && attempt < REBIND_RETRY_ATTEMPTS => + { + attempt += 1; + debug!( + ?err, + attempt, "rebind hit EADDRINUSE on {old_addr}, retrying" + ); + time::sleep(REBIND_RETRY_DELAY).await; + } + Err(err) => { + warn!("rebinding IP transport failed: {err:#}"); + return; + } + } + } + if let Ok(addr) = socket.local_addr() { + local_addr.set(addr).ok(); + trace!("rebound from {} to {}", old_addr, addr); + } + }; + *rebind_task = Some(AbortOnDropHandle::new(n0_future::task::spawn(fut))); + Ok(()) + } + Err(err) => Err(err), + } } pub(super) fn on_network_change(&self, _info: &crate::socket::Report) { diff --git a/iroh/tests/integration.rs b/iroh/tests/integration.rs index 5e1606c58ab..d6398aaebea 100644 --- a/iroh/tests/integration.rs +++ b/iroh/tests/integration.rs @@ -32,8 +32,6 @@ use wasm_bindgen_test::wasm_bindgen_test as test; const ECHO_ALPN: &[u8] = b"echo"; -// Skipped on Android: Test is flaky in the emulator. -#[cfg_attr(target_os = "android", ignore = "flaky against staging from emulator")] #[test] async fn simple_endpoint_id_based_connection_transfer() -> Result { std::panic::set_hook(Box::new(console_error_panic_hook::hook));