|
| 1 | +//! Regression test for the v7 LIST-failure fix. |
| 2 | +//! |
| 3 | +//! Background: after the v1→v7 migration of a large bucket, LIST walks the |
| 4 | +//! sharded HAMT via `list_all_files`, which fans out to many concurrent |
| 5 | +//! `S3BlobBackend::get` calls on `/bucket/__fula_forest_v7_nodes/<key>`. When |
| 6 | +//! the gateway is fronted by an nginx `limit_req` with a `burst` smaller than |
| 7 | +//! the in-flight count, some requests come back as HTTP 503 with an empty |
| 8 | +//! body — a single one of those aborted the whole LIST. |
| 9 | +//! |
| 10 | +//! The fix adds bounded fixed-delay retries (300 ms base + 0-100 ms jitter, |
| 11 | +//! up to 4 attempts) in `S3BlobBackend::{get, put}` for the transient class |
| 12 | +//! HTTP 429/500/502/503/504 / `SlowDown` / `InternalError` / |
| 13 | +//! `ServiceUnavailable`. This test exercises that retry path end-to-end |
| 14 | +//! against a `wiremock::MockServer` that emits 503 → 503 → 200 and asserts: |
| 15 | +//! |
| 16 | +//! 1. The outer call returns `Ok`. |
| 17 | +//! 2. `blob_backend_retry_count()` observes exactly the number of retry |
| 18 | +//! sleeps we expect. |
| 19 | +//! 3. Non-transient responses (e.g. 404) are NOT retried. |
| 20 | +
|
| 21 | +#![cfg(not(target_arch = "wasm32"))] |
| 22 | + |
| 23 | +use fula_client::{Config, FulaClient, S3BlobBackend, blob_backend_retry_count}; |
| 24 | +use fula_crypto::BlobBackend; |
| 25 | +use std::sync::Arc; |
| 26 | +use std::sync::atomic::{AtomicUsize, Ordering}; |
| 27 | +use wiremock::matchers::{method, path}; |
| 28 | +use wiremock::{Mock, MockServer, Request, Respond, ResponseTemplate}; |
| 29 | + |
/// Serializes the tests in this file that touch the retry counter.
///
/// The retry counter is process-wide and every test here may bump it, so
/// each test takes this lock before doing any work. That way the
/// `before → after` delta a test measures contains only its own increments.
/// Tests recover the guard from a poisoned lock instead of unwrapping it, so
/// one failing test does not cascade into the others.
static COUNTER_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
| 34 | + |
| 35 | +/// Responder that emits the Nth response from a list; cycles if called more |
| 36 | +/// than `responses.len()` times. Lets us script "503, 503, 200" against a |
| 37 | +/// single path without wiremock's expectation counters fighting us. |
| 38 | +struct Scripted { |
| 39 | + calls: Arc<AtomicUsize>, |
| 40 | + responses: Vec<ResponseTemplate>, |
| 41 | +} |
| 42 | + |
| 43 | +impl Respond for Scripted { |
| 44 | + fn respond(&self, _req: &Request) -> ResponseTemplate { |
| 45 | + let idx = self.calls.fetch_add(1, Ordering::SeqCst); |
| 46 | + let slot = idx.min(self.responses.len() - 1); |
| 47 | + self.responses[slot].clone() |
| 48 | + } |
| 49 | +} |
| 50 | + |
| 51 | +fn mk_client(endpoint: &str) -> FulaClient { |
| 52 | + // 10 s per-request timeout is ample for mock responses; the test doesn't |
| 53 | + // need a real connect timeout. |
| 54 | + let cfg = Config::new(endpoint); |
| 55 | + FulaClient::new(cfg).expect("build FulaClient") |
| 56 | +} |
| 57 | + |
| 58 | +#[tokio::test] |
| 59 | +async fn get_retries_through_two_503s_then_succeeds() { |
| 60 | + let _guard = COUNTER_LOCK.lock().unwrap_or_else(|p| p.into_inner()); |
| 61 | + let server = MockServer::start().await; |
| 62 | + |
| 63 | + let calls = Arc::new(AtomicUsize::new(0)); |
| 64 | + let responder = Scripted { |
| 65 | + calls: calls.clone(), |
| 66 | + responses: vec![ |
| 67 | + ResponseTemplate::new(503), |
| 68 | + ResponseTemplate::new(503), |
| 69 | + ResponseTemplate::new(200).set_body_bytes(b"hamt-node-bytes".to_vec()), |
| 70 | + ], |
| 71 | + }; |
| 72 | + Mock::given(method("GET")) |
| 73 | + .and(path("/images/__fula_forest_v7_nodes/deadbeef")) |
| 74 | + .respond_with(responder) |
| 75 | + .mount(&server) |
| 76 | + .await; |
| 77 | + |
| 78 | + let before = blob_backend_retry_count(); |
| 79 | + let client = mk_client(&server.uri()); |
| 80 | + let backend = S3BlobBackend::new(client, "images".to_string()); |
| 81 | + |
| 82 | + let got = backend |
| 83 | + .get("__fula_forest_v7_nodes/deadbeef") |
| 84 | + .await |
| 85 | + .expect("retry should absorb two 503s"); |
| 86 | + |
| 87 | + assert_eq!(got, b"hamt-node-bytes"); |
| 88 | + assert_eq!( |
| 89 | + calls.load(Ordering::SeqCst), |
| 90 | + 3, |
| 91 | + "mock should have been hit once per attempt" |
| 92 | + ); |
| 93 | + let retries = blob_backend_retry_count() - before; |
| 94 | + assert_eq!(retries, 2, "two 503s → two retry sleeps, then success"); |
| 95 | +} |
| 96 | + |
| 97 | +#[tokio::test] |
| 98 | +async fn put_retries_through_one_503_then_succeeds() { |
| 99 | + let _guard = COUNTER_LOCK.lock().unwrap_or_else(|p| p.into_inner()); |
| 100 | + let server = MockServer::start().await; |
| 101 | + |
| 102 | + let calls = Arc::new(AtomicUsize::new(0)); |
| 103 | + let responder = Scripted { |
| 104 | + calls: calls.clone(), |
| 105 | + responses: vec![ |
| 106 | + ResponseTemplate::new(503), |
| 107 | + ResponseTemplate::new(200).insert_header("ETag", "\"abc123\""), |
| 108 | + ], |
| 109 | + }; |
| 110 | + Mock::given(method("PUT")) |
| 111 | + .and(path("/images/__fula_forest_v7_nodes/cafebabe")) |
| 112 | + .respond_with(responder) |
| 113 | + .mount(&server) |
| 114 | + .await; |
| 115 | + |
| 116 | + let before = blob_backend_retry_count(); |
| 117 | + let client = mk_client(&server.uri()); |
| 118 | + let backend = S3BlobBackend::new(client, "images".to_string()); |
| 119 | + |
| 120 | + backend |
| 121 | + .put("__fula_forest_v7_nodes/cafebabe", b"encrypted-node-blob".to_vec()) |
| 122 | + .await |
| 123 | + .expect("retry should absorb the 503"); |
| 124 | + |
| 125 | + assert_eq!(calls.load(Ordering::SeqCst), 2); |
| 126 | + let retries = blob_backend_retry_count() - before; |
| 127 | + assert_eq!(retries, 1, "one 503 → one retry sleep"); |
| 128 | +} |
| 129 | + |
| 130 | +#[tokio::test] |
| 131 | +async fn get_gives_up_after_max_attempts_on_persistent_503() { |
| 132 | + let _guard = COUNTER_LOCK.lock().unwrap_or_else(|p| p.into_inner()); |
| 133 | + let server = MockServer::start().await; |
| 134 | + |
| 135 | + let calls = Arc::new(AtomicUsize::new(0)); |
| 136 | + let responder = Scripted { |
| 137 | + calls: calls.clone(), |
| 138 | + // Always 503 — more entries than attempts so Scripted never wraps. |
| 139 | + responses: vec![ |
| 140 | + ResponseTemplate::new(503), |
| 141 | + ResponseTemplate::new(503), |
| 142 | + ResponseTemplate::new(503), |
| 143 | + ResponseTemplate::new(503), |
| 144 | + ResponseTemplate::new(503), |
| 145 | + ], |
| 146 | + }; |
| 147 | + Mock::given(method("GET")) |
| 148 | + .and(path("/images/__fula_forest_v7_nodes/persistently-broken")) |
| 149 | + .respond_with(responder) |
| 150 | + .mount(&server) |
| 151 | + .await; |
| 152 | + |
| 153 | + let client = mk_client(&server.uri()); |
| 154 | + let backend = S3BlobBackend::new(client, "images".to_string()); |
| 155 | + |
| 156 | + let err = backend |
| 157 | + .get("__fula_forest_v7_nodes/persistently-broken") |
| 158 | + .await |
| 159 | + .expect_err("persistent 503 must eventually surface"); |
| 160 | + |
| 161 | + // Four total attempts = `BLOB_BACKEND_MAX_ATTEMPTS`. |
| 162 | + assert_eq!(calls.load(Ordering::SeqCst), 4, "attempts capped at 4"); |
| 163 | + // Error message should mention the storage-backend failure; we don't |
| 164 | + // pin the exact string (ClientError::to_string is wrapped via |
| 165 | + // client_err_to_crypto → CryptoError::Storage). |
| 166 | + let msg = err.to_string(); |
| 167 | + assert!( |
| 168 | + msg.to_lowercase().contains("storage") || msg.contains("503"), |
| 169 | + "unexpected error message: {msg}" |
| 170 | + ); |
| 171 | +} |
| 172 | + |
| 173 | +#[tokio::test] |
| 174 | +async fn get_does_not_retry_on_404() { |
| 175 | + let _guard = COUNTER_LOCK.lock().unwrap_or_else(|p| p.into_inner()); |
| 176 | + let server = MockServer::start().await; |
| 177 | + |
| 178 | + let calls = Arc::new(AtomicUsize::new(0)); |
| 179 | + let responder = Scripted { |
| 180 | + calls: calls.clone(), |
| 181 | + responses: vec![ResponseTemplate::new(404)], |
| 182 | + }; |
| 183 | + Mock::given(method("GET")) |
| 184 | + .and(path("/images/__fula_forest_v7_nodes/not-there")) |
| 185 | + .respond_with(responder) |
| 186 | + .mount(&server) |
| 187 | + .await; |
| 188 | + |
| 189 | + let before = blob_backend_retry_count(); |
| 190 | + let client = mk_client(&server.uri()); |
| 191 | + let backend = S3BlobBackend::new(client, "images".to_string()); |
| 192 | + |
| 193 | + let _err = backend |
| 194 | + .get("__fula_forest_v7_nodes/not-there") |
| 195 | + .await |
| 196 | + .expect_err("404 must not be retried"); |
| 197 | + |
| 198 | + assert_eq!(calls.load(Ordering::SeqCst), 1, "404 is terminal"); |
| 199 | + assert_eq!( |
| 200 | + blob_backend_retry_count(), |
| 201 | + before, |
| 202 | + "non-transient error must not bump retry counter" |
| 203 | + ); |
| 204 | +} |
0 commit comments