Skip to content

Commit 1ed969e

Browse files
committed
feat: verify origin + proxy URL respond before claiming tunnel is ready
Controllers reporting Ready (with observedGeneration in sync) still doesn't mean the data plane is actually carrying traffic — Envoy programming a route is not the same as Envoy serving it. The user reported a ~2-minute window where every condition was True but https://<proxy>/ returned 503. Whatever is behind it (xDS push lag, edge config not yet loaded, iroh peer connection still settling), it is invisible from the controller's view.

Add a "Verifying connectivity..." phase between the condition checklist and "Tunnel ready". Every 10s, probe in parallel:
- the origin URL the user gave (so a downed local service is named explicitly instead of being blamed on the tunnel)
- the public proxy URL (https://<hostname>/)

Any response under 500 counts as "reachable" — 4xx like 401/404 are fine because the edge is forwarding; only 5xx and transport errors block. On each tick we print a ✓ line for newly-reachable endpoints and a … line for ones still failing, with the probe's last error so the user can act ("origin connection refused" vs "proxy 503").

New --timeout flag (default 10m, humantime) caps total setup including verification. On expiry the command exits non-zero with a per-side summary so an unverified tunnel doesn't get treated as healthy. Sleep is clamped to the remaining budget so an early success on one side doesn't waste the last 10s before bailing on the other.
1 parent d8f7c96 commit 1ed969e

3 files changed

Lines changed: 175 additions & 6 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cli/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@ rand.workspace = true
2424
hex.workspace = true
2525
sentry.workspace = true
2626
rustls = { workspace = true, features = ["ring"] }
27-
inquire = "0.9.4"
27+
inquire = "0.9.4"
28+
reqwest.workspace = true

cli/src/main.rs

Lines changed: 172 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,15 @@ pub enum TunnelCommands {
266266
/// reused — useful for resuming a tunnel verbatim.
267267
#[clap(long)]
268268
id: Option<String>,
269+
/// Maximum wall-clock budget for the whole setup including the
270+
/// end-to-end connectivity verification (origin probe + proxy URL
271+
/// probe). When the controller conditions all flip Ready, the CLI
272+
/// keeps polling both URLs every 10s until they respond non-5xx.
273+
/// If this elapses with probes still failing, the command exits
274+
/// non-zero with a summary so you don't think it's healthy when
275+
/// it isn't. Accepts humantime values like "5m" or "30s".
276+
#[clap(long, default_value = "10m")]
277+
timeout: humantime::Duration,
269278
/// Skip confirmation prompt if tunnel already exists.
270279
#[clap(long, default_value = "false")]
271280
yes: bool,
@@ -627,7 +636,7 @@ async fn main() -> n0_error::Result<()> {
627636
}
628637
}
629638
}
630-
TunnelCommands::Listen { label, endpoint, id, yes } => {
639+
TunnelCommands::Listen { label, endpoint, id, timeout, yes } => {
631640
let endpoint_id = node.endpoint_id();
632641

633642
// Resolve (existing_tunnel, effective_endpoint):
@@ -749,10 +758,21 @@ async fn main() -> n0_error::Result<()> {
749758
println!("Setting up tunnel...");
750759
let progress = await_tunnel_progress(&service, &tunnel_id).await?;
751760

752-
let elapsed = progress.elapsed.as_secs();
753-
for hostname in &progress.hostnames {
754-
println!("Tunnel ready after {} sec: https://{}", elapsed, hostname);
755-
}
761+
let hostname = progress
762+
.hostnames
763+
.first()
764+
.cloned()
765+
.ok_or_else(|| n0_error::anyerr!("tunnel has no hostname after setup"))?;
766+
println!("Verifying connectivity...");
767+
let verify_start = std::time::Instant::now();
768+
let budget = (*timeout).saturating_sub(progress.elapsed);
769+
verify_endpoints(&endpoint, &hostname, budget).await?;
770+
let total = progress.elapsed + verify_start.elapsed();
771+
println!(
772+
"Tunnel ready after {} sec: https://{}",
773+
total.as_secs(),
774+
hostname,
775+
);
756776
println!("Press Ctrl+C to stop...");
757777

758778
// Watch login state so a permanent auth loss mid-session
@@ -1192,3 +1212,150 @@ async fn await_tunnel_progress(
11921212
tokio::time::sleep(PROGRESS_POLL_INTERVAL).await;
11931213
}
11941214
}
1215+
1216+
/// How often to retry the connectivity probes during the verify phase.
///
/// `verify_endpoints` sleeps for this long after a round in which at least
/// one side is still failing, clamped to whatever is left of its budget.
1217+
const VERIFY_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
1218+
1219+
/// Outcome of one probe attempt. We split "reachable but error" from
1220+
/// "couldn't even connect" so the user gets a useful failure summary
1221+
/// (different remedies for "your server is down" vs "the edge is 503ing").
1222+
#[derive(Debug, Clone)]
1223+
enum ProbeOutcome {
1224+
Ok { status: u16 },
1225+
HttpStatus { status: u16 },
1226+
NotReachable { reason: String },
1227+
}
1228+
1229+
impl ProbeOutcome {
1230+
fn ok(&self) -> bool {
1231+
matches!(self, ProbeOutcome::Ok { .. })
1232+
}
1233+
1234+
fn detail(&self) -> String {
1235+
match self {
1236+
ProbeOutcome::Ok { status } => format!("HTTP {status}"),
1237+
ProbeOutcome::HttpStatus { status } => format!("HTTP {status}"),
1238+
ProbeOutcome::NotReachable { reason } => reason.clone(),
1239+
}
1240+
}
1241+
}
1242+
1243+
/// Probe one URL. Any response under 500 (including 4xx like 401/404) is
1244+
/// considered "reachable" — the data path is forwarding even if the
1245+
/// origin chose to reject the request. Only 5xx + transport errors are
1246+
/// counted as failures so we don't false-fail on authenticated origins.
1247+
async fn probe(client: &reqwest::Client, url: &str) -> ProbeOutcome {
1248+
match client.get(url).send().await {
1249+
Ok(resp) => {
1250+
let status = resp.status().as_u16();
1251+
if status < 500 {
1252+
ProbeOutcome::Ok { status }
1253+
} else {
1254+
ProbeOutcome::HttpStatus { status }
1255+
}
1256+
}
1257+
Err(err) => ProbeOutcome::NotReachable { reason: err.to_string() },
1258+
}
1259+
}
1260+
1261+
/// After the controller conditions all flip Ready, the data plane still
1262+
/// needs a moment to actually carry traffic — Envoy programming a route
1263+
/// is not the same as Envoy serving it. Probe both the user's local
1264+
/// origin and the public proxy URL every 10 seconds until both respond
1265+
/// non-5xx, or the timeout budget runs out.
1266+
async fn verify_endpoints(
1267+
origin_url: &str,
1268+
proxy_hostname: &str,
1269+
budget: std::time::Duration,
1270+
) -> n0_error::Result<()> {
1271+
let proxy_url = format!("https://{proxy_hostname}/");
1272+
let client = reqwest::Client::builder()
1273+
// Per-request timeout shorter than the poll interval so a stuck
1274+
// request can't eat the whole 10s gap.
1275+
.timeout(std::time::Duration::from_secs(5))
1276+
.build()
1277+
.map_err(|err| n0_error::anyerr!("failed to build HTTP client: {err}"))?;
1278+
1279+
let start = std::time::Instant::now();
1280+
let mut origin_ok = false;
1281+
let mut proxy_ok = false;
1282+
let mut last_origin: Option<ProbeOutcome> = None;
1283+
let mut last_proxy: Option<ProbeOutcome> = None;
1284+
1285+
loop {
1286+
// Probe in parallel — skip the side that's already ready.
1287+
let origin_fut = async {
1288+
if origin_ok { None } else { Some(probe(&client, origin_url).await) }
1289+
};
1290+
let proxy_fut = async {
1291+
if proxy_ok { None } else { Some(probe(&client, &proxy_url).await) }
1292+
};
1293+
let (now_origin, now_proxy) = tokio::join!(origin_fut, proxy_fut);
1294+
let elapsed = start.elapsed().as_secs_f32();
1295+
1296+
if let Some(o) = now_origin {
1297+
if o.ok() {
1298+
origin_ok = true;
1299+
println!(
1300+
" ✓ origin reachable ({elapsed:.1}s) [{origin_url}]: {}",
1301+
o.detail(),
1302+
);
1303+
} else {
1304+
eprintln!(
1305+
" … origin not reachable ({elapsed:.0}s) [{origin_url}]: {}",
1306+
o.detail(),
1307+
);
1308+
}
1309+
last_origin = Some(o);
1310+
}
1311+
if let Some(p) = now_proxy {
1312+
if p.ok() {
1313+
proxy_ok = true;
1314+
println!(
1315+
" ✓ proxy responding ({elapsed:.1}s) [https://{proxy_hostname}]: {}",
1316+
p.detail(),
1317+
);
1318+
} else {
1319+
eprintln!(
1320+
" … proxy not responding ({elapsed:.0}s) [https://{proxy_hostname}]: {}",
1321+
p.detail(),
1322+
);
1323+
}
1324+
last_proxy = Some(p);
1325+
}
1326+
1327+
if origin_ok && proxy_ok {
1328+
return Ok(());
1329+
}
1330+
if start.elapsed() >= budget {
1331+
break;
1332+
}
1333+
// Don't sleep past the budget — clamp the wait so an early
1334+
// success on one side doesn't make us waste 10s before bailing.
1335+
let remaining = budget.saturating_sub(start.elapsed());
1336+
tokio::time::sleep(VERIFY_POLL_INTERVAL.min(remaining)).await;
1337+
}
1338+
1339+
let mut parts: Vec<String> = Vec::new();
1340+
if !origin_ok {
1341+
let detail = last_origin
1342+
.as_ref()
1343+
.map(|o| o.detail())
1344+
.unwrap_or_else(|| "never probed".into());
1345+
parts.push(format!("origin {origin_url} never responded ({detail})"));
1346+
}
1347+
if !proxy_ok {
1348+
let detail = last_proxy
1349+
.as_ref()
1350+
.map(|p| p.detail())
1351+
.unwrap_or_else(|| "never probed".into());
1352+
parts.push(format!(
1353+
"proxy https://{proxy_hostname} never returned non-5xx ({detail})"
1354+
));
1355+
}
1356+
n0_error::bail_any!(
1357+
"connectivity verification timed out after {}s: {}",
1358+
start.elapsed().as_secs(),
1359+
parts.join("; "),
1360+
)
1361+
}

0 commit comments

Comments
 (0)