Skip to content

Commit d8f7c96

Browse files
committed
feat: retry quota-check timeouts on tunnel resource creates
The operator's quota service occasionally times out the admission check on Create requests and returns 403 with: "Your request took too long to be checked against your quota. Please try again in a moment — if this keeps happening, contact support." The error message itself says "try again". Until today the CLI just surfaced the raw 403 and bailed mid-listen, which the user then had to recover from manually.

Add is_quota_check_timeout(), which matches the specific 403 message (distinct from real quota exhaustion, which uses different wording), and with_quota_check_retry(), which retries up to ~15s (1s, 2s, 4s, 8s, final attempt) on that exact class. Other 403s — real exhaustion, IAM denials, admission rejections — return immediately so genuine failures still surface fast. Prints a one-line stderr notice on first retry so the user knows we're waiting on the server, not wedged.

Apply at every kube .create() site in the tunnel lifecycle:
- HTTPProxy create (fresh tunnel)
- ConnectorAdvertisement create (fresh tunnel)
- TrafficProtectionPolicy create (fresh tunnel)
- ConnectorAdvertisement create (set_enabled when resuming)
- Connector create (ensure_connector first run)

Also tighten format_quota_error to skip the timeout phrase: when retries exhaust, the user should see the actual server message rather than "Quota limit exceeded for ConnectorAdvertisement", which is the wrong diagnosis. Real "Insufficient quota" exhaustion still gets the helpful message.

A test covers the classifier and the formatter carve-out on both the timeout and real-exhaustion shapes.
1 parent cff37e7 commit d8f7c96

1 file changed

Lines changed: 146 additions & 37 deletions

File tree

lib/src/tunnels.rs

Lines changed: 146 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -586,21 +586,22 @@ impl TunnelService {
586586
},
587587
status: None,
588588
};
589-
proxy = proxies
590-
.create(&PostParams::default(), &proxy)
591-
.await
592-
.map_err(|err| {
593-
warn!(
594-
%project_id,
595-
connector = %connector_name,
596-
endpoint = %endpoint,
597-
"HTTPProxy create failed: {err:#}"
598-
);
599-
format_quota_error(&err, "HTTPProxy").unwrap_or_else(|| {
600-
format!("Failed to create HTTPProxy: {err}")
601-
})
602-
})
603-
.map_err(|err| n0_error::anyerr!(err))?;
589+
let post_params = PostParams::default();
590+
proxy = with_quota_check_retry("HTTPProxy create", || {
591+
proxies.create(&post_params, &proxy)
592+
})
593+
.await
594+
.map_err(|err| {
595+
warn!(
596+
%project_id,
597+
connector = %connector_name,
598+
endpoint = %endpoint,
599+
"HTTPProxy create failed: {err:#}"
600+
);
601+
format_quota_error(&err, "HTTPProxy")
602+
.unwrap_or_else(|| format!("Failed to create HTTPProxy: {err}"))
603+
})
604+
.map_err(|err| n0_error::anyerr!(err))?;
604605
let proxy_name = proxy.name_any();
605606
debug!(
606607
%project_id,
@@ -624,20 +625,22 @@ impl TunnelService {
624625
spec: ad_spec,
625626
status: None,
626627
};
627-
ads.create(&PostParams::default(), &ad)
628-
.await
629-
.map_err(|err| {
630-
warn!(
631-
%project_id,
632-
proxy = %proxy_name,
633-
connector = %connector_name,
634-
"ConnectorAdvertisement create failed: {err:#}"
635-
);
636-
format_quota_error(&err, "ConnectorAdvertisement").unwrap_or_else(|| {
637-
format!("Failed to create ConnectorAdvertisement: {err}")
638-
})
639-
})
640-
.map_err(|err| n0_error::anyerr!(err))?;
628+
let ad_post = PostParams::default();
629+
with_quota_check_retry("ConnectorAdvertisement create", || {
630+
ads.create(&ad_post, &ad)
631+
})
632+
.await
633+
.map_err(|err| {
634+
warn!(
635+
%project_id,
636+
proxy = %proxy_name,
637+
connector = %connector_name,
638+
"ConnectorAdvertisement create failed: {err:#}"
639+
);
640+
format_quota_error(&err, "ConnectorAdvertisement")
641+
.unwrap_or_else(|| format!("Failed to create ConnectorAdvertisement: {err}"))
642+
})
643+
.map_err(|err| n0_error::anyerr!(err))?;
641644
debug!(
642645
%project_id,
643646
proxy = %proxy_name,
@@ -681,7 +684,10 @@ impl TunnelService {
681684
},
682685
status: None,
683686
};
684-
tpps.create(&PostParams::default(), &tpp)
687+
let tpp_post = PostParams::default();
688+
with_quota_check_retry("TrafficProtectionPolicy create", || {
689+
tpps.create(&tpp_post, &tpp)
690+
})
685691
.await
686692
.map_err(|err| {
687693
warn!(
@@ -883,9 +889,12 @@ impl TunnelService {
883889
spec: ad_spec,
884890
status: None,
885891
};
886-
ads.create(&PostParams::default(), &ad)
887-
.await
888-
.std_context("Failed to create ConnectorAdvertisement")?;
892+
let ad_post = PostParams::default();
893+
with_quota_check_retry("ConnectorAdvertisement create", || {
894+
ads.create(&ad_post, &ad)
895+
})
896+
.await
897+
.std_context("Failed to create ConnectorAdvertisement")?;
889898
}
890899
}
891900
} else if ads
@@ -1157,10 +1166,12 @@ impl TunnelService {
11571166
},
11581167
status: None,
11591168
};
1160-
connector = connectors
1161-
.create(&PostParams::default(), &connector)
1162-
.await
1163-
.std_context("Failed to create Connector")?;
1169+
let conn_post = PostParams::default();
1170+
connector = with_quota_check_retry("Connector create", || {
1171+
connectors.create(&conn_post, &connector)
1172+
})
1173+
.await
1174+
.std_context("Failed to create Connector")?;
11641175

11651176
if let Some(details) = build_connection_details(&self.listen) {
11661177
let details_value = serde_json::to_value(details)
@@ -1477,6 +1488,13 @@ async fn patch_device_annotations(api: &Api<Connector>, connector: &mut Connecto
14771488

14781489
fn format_quota_error(err: &dyn std::error::Error, resource_type: &str) -> Option<String> {
14791490
let err_msg = err.to_string();
1491+
// Transient quota-check timeout — the error literally says "Please try
1492+
// again in a moment". Don't relabel it as "exceeded"; with the retry
1493+
// wrapper applied at creation sites we'll usually never get here, and
1494+
// when we do the original message is the most accurate signal.
1495+
if err_msg.contains("took too long to be checked against your quota") {
1496+
return None;
1497+
}
14801498
if err_msg.contains("quota") || err_msg.contains("Insufficient quota") {
14811499
return Some(format!(
14821500
"Quota limit exceeded for {resource_type} resources.\n\n\
@@ -1490,6 +1508,57 @@ fn format_quota_error(err: &dyn std::error::Error, resource_type: &str) -> Optio
14901508
None
14911509
}
14921510

1511+
/// True if `err` is the operator's transient quota-check timeout (a 403
1512+
/// whose message says "Please try again in a moment"). Distinct from
1513+
/// real quota exhaustion, which produces a different message and
1514+
/// shouldn't be retried.
1515+
fn is_quota_check_timeout(err: &kube::Error) -> bool {
1516+
matches!(
1517+
err,
1518+
kube::Error::Api(e)
1519+
if e.code == 403
1520+
&& e.message.contains("took too long to be checked against your quota")
1521+
)
1522+
}
1523+
1524+
/// Retry a kube API call up to ~15 seconds while it keeps tripping the
1525+
/// operator's quota-check timeout. Other errors return immediately so
1526+
/// real failures still surface fast. Prints a one-line stderr notice on
1527+
/// the first retry so the user knows we're waiting on the server.
1528+
async fn with_quota_check_retry<T, F, Fut>(op_name: &str, mut f: F) -> kube::Result<T>
1529+
where
1530+
F: FnMut() -> Fut,
1531+
Fut: std::future::Future<Output = kube::Result<T>>,
1532+
{
1533+
let delays = [
1534+
std::time::Duration::from_secs(1),
1535+
std::time::Duration::from_secs(2),
1536+
std::time::Duration::from_secs(4),
1537+
std::time::Duration::from_secs(8),
1538+
];
1539+
for (i, delay) in delays.iter().enumerate() {
1540+
match f().await {
1541+
Ok(v) => return Ok(v),
1542+
Err(err) if is_quota_check_timeout(&err) => {
1543+
if i == 0 {
1544+
eprintln!(
1545+
" … quota check timed out for {op_name}; retrying for up to 15s"
1546+
);
1547+
}
1548+
warn!(
1549+
op = op_name,
1550+
attempt = i + 1,
1551+
next_delay_s = delay.as_secs(),
1552+
"quota check timed out; retrying"
1553+
);
1554+
tokio::time::sleep(*delay).await;
1555+
}
1556+
Err(err) => return Err(err),
1557+
}
1558+
}
1559+
f().await
1560+
}
1561+
14931562
fn publish_tickets_enabled() -> bool {
14941563
std::env::var("DATUM_CONNECT_PUBLISH_TICKETS")
14951564
.map(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES"))
@@ -1678,6 +1747,46 @@ mod tests {
16781747
);
16791748
}
16801749

1750+
fn api_error(code: u16, message: &str) -> kube::Error {
1751+
kube::Error::Api(kube::core::ErrorResponse {
1752+
status: "Failure".into(),
1753+
message: message.into(),
1754+
reason: if code == 403 { "Forbidden".into() } else { "Unknown".into() },
1755+
code,
1756+
})
1757+
}
1758+
1759+
#[test]
fn quota_check_timeout_classifier_matches_transient_403() {
    // Fixture 1: the phrase the operator emits when the quota check itself
    // times out. The message literally asks the caller to try again.
    let timeout = api_error(
        403,
        "connectoradvertisements.networking.datumapis.com \"tunnel-x\" is forbidden: \
         Your request took too long to be checked against your quota. Please try again \
         in a moment — if this keeps happening, contact support.",
    );
    // Fixture 2: genuine quota exhaustion, worded differently.
    let real_exhaustion = api_error(403, "Insufficient quota for ConnectorAdvertisement");
    // Fixture 3: the timeout phrase under a non-403 status code.
    let wrong_status = api_error(401, "took too long to be checked against your quota");

    // Classifier: only the 403 carrying the timeout phrase is retryable.
    assert!(is_quota_check_timeout(&timeout));
    assert!(!is_quota_check_timeout(&real_exhaustion));
    assert!(!is_quota_check_timeout(&wrong_status));

    // Formatter: the timeout must not be relabelled as "Quota limit
    // exceeded", while real exhaustion still gets the formatted message.
    assert!(
        format_quota_error(&timeout, "ConnectorAdvertisement").is_none(),
        "transient timeout must propagate verbatim, not become 'exceeded'"
    );
    assert!(format_quota_error(&real_exhaustion, "ConnectorAdvertisement").is_some());
}
1789+
16811790
#[test]
16821791
fn progress_pending_when_status_is_stale_for_current_generation() {
16831792
// `tunnel listen --id` PATCHes the HTTPProxy spec to re-point the

0 commit comments

Comments
 (0)