@@ -131,6 +131,14 @@ const H2_OPEN_TIMEOUT_SECS: u64 = 8;
131131/// long. Prevents every concurrent caller during an h2 outage from
132132/// paying its own full handshake-timeout cost in turn.
133133const H2_OPEN_FAILURE_BACKOFF_SECS : u64 = 15 ;
134+ /// Same idea as `H2_OPEN_TIMEOUT_SECS` but for the legacy h1 socket
135+ /// path. Without this, a stuck TCP connect or TLS handshake to a
136+ /// blackholed `connect_host:443` would block `acquire()` (and the
137+ /// `warm()` prewarm loop) until the outer batch budget elapsed —
138+ /// the same symptom #924 hit during the warm-race window. Bounded
139+ /// here so a single hung handshake aborts fast and the loop / caller
140+ /// makes progress on the next attempt.
141+ const H1_OPEN_TIMEOUT_SECS : u64 = 8 ;
134142/// Cadence for Apps Script container keepalive pings. Apps Script
135143/// containers go cold after ~5min idle and cost 1-3s on the first
136144/// request to wake back up — most painful on YouTube / streaming where
@@ -156,10 +164,23 @@ struct PoolEntry {
156164/// `generation` is monotonic per fronter and lets `poison_h2_if_gen`
157165/// avoid the race where task A's stale failure clears task B's
158166/// freshly-reopened healthy cell.
167+ ///
168+ /// `dead` is set by the spawned connection-driver task when the h2
169+ /// `Connection` future ends (GOAWAY, network error, normal close).
170+ /// Without this, the cell silently held a dead `SendRequest` after a
171+ /// mid-session disconnect — the next request paid a wasted h2 round
172+ /// trip to detect it via `ready()` failure, AND `run_pool_refill`
173+ /// kept maintaining the small `POOL_MIN_H2_FALLBACK` (2-socket) pool
174+ /// instead of expanding to `POOL_MIN` (8). With the flag,
175+ /// `run_pool_refill` notices h2 is dead within one tick (≤5 s) and
176+ /// pre-warms the larger fallback pool before the next request burst,
177+ /// and `ensure_h2` short-circuits the `H2_CONN_TTL_SECS`-based
178+ /// liveness check on a known-dead cell.
159179struct H2Cell {
160180 send : h2:: client:: SendRequest < Bytes > ,
161181 created : Instant ,
162182 generation : u64 ,
183+ dead : Arc < AtomicBool > ,
163184}
164185
165186/// "Did this request reach Apps Script?" signal carried out of every
@@ -864,44 +885,63 @@ impl DomainFronter {
864885 }
865886
866887 async fn open ( & self ) -> Result < PooledStream , FronterError > {
867- let tcp = TcpStream :: connect ( ( self . connect_host . as_str ( ) , 443u16 ) ) . await ?;
868- let _ = tcp. set_nodelay ( true ) ;
869- let sni = self . next_sni ( ) ;
870- let name = ServerName :: try_from ( sni) ?;
871- // Always use the h1-only connector here — the pool only holds
872- // sockets that the raw HTTP/1.1 fallback path can write to.
873- // Using the shared connector would let some pooled sockets
874- // negotiate h2, which would then misframe every fallback
875- // request that lands on them.
876- let tls = self . tls_connector_h1 . connect ( name, tcp) . await ?;
877- Ok ( tls)
888+ // Bounded TCP+TLS open. See `H1_OPEN_TIMEOUT_SECS`.
889+ let work = async {
890+ let tcp = TcpStream :: connect ( ( self . connect_host . as_str ( ) , 443u16 ) ) . await ?;
891+ let _ = tcp. set_nodelay ( true ) ;
892+ let sni = self . next_sni ( ) ;
893+ let name = ServerName :: try_from ( sni) ?;
894+ // Always use the h1-only connector here — the pool only holds
895+ // sockets that the raw HTTP/1.1 fallback path can write to.
896+ // Using the shared connector would let some pooled sockets
897+ // negotiate h2, which would then misframe every fallback
898+ // request that lands on them.
899+ let tls = self . tls_connector_h1 . connect ( name, tcp) . await ?;
900+ Ok :: < _ , FronterError > ( tls)
901+ } ;
902+ match tokio:: time:: timeout ( Duration :: from_secs ( H1_OPEN_TIMEOUT_SECS ) , work) . await {
903+ Ok ( r) => r,
904+ Err ( _) => Err ( FronterError :: Relay ( format ! (
905+ "h1 open timed out after {}s" ,
906+ H1_OPEN_TIMEOUT_SECS
907+ ) ) ) ,
908+ }
878909 }
879910
880911 /// Open outbound TLS connections eagerly so the first relay request
881912 /// doesn't pay a cold handshake.
882913 ///
883- /// When h2 is enabled, attempts to open the multiplexed h2 cell
884- /// first. Success there means one TCP/TLS handshake serves all
885- /// future requests, so we only need a tiny fallback h1 pool
886- /// (clamped to 2) instead of the full `n` requested. On h2 failure
887- /// (ALPN refusal, network error), falls back to the legacy
888- /// behavior: warm the full `n` h1 sockets.
914+ /// h2 and h1 prewarm run in parallel: a request that arrives while
915+ /// the h2 handshake is still in flight (or has just hit its 8 s
916+ /// timeout) needs a warm h1 socket waiting for it, otherwise the
917+ /// h1 fallback path pays a cold handshake on the same slow network
918+ /// and the 30 s outer batch budget elapses (#924). v1.9.14 warmed
919+ /// h1 unconditionally; v1.9.15 (PR #799) accidentally gated the h1
920+ /// prewarm behind `ensure_h2()` so the h1 pool stayed empty during
921+ /// the h2 init window.
889922 ///
890- /// Staggered 500 ms apart so we don't burst N TLS handshakes at the
891- /// Google edge simultaneously, and each connection gets an 8 s
892- /// expiry offset so they roll off gradually instead of all hitting
893- /// POOL_TTL_SECS at once.
923+ /// The spawned h2 handshake races h1[0] — boot fires two TLS
924+ /// handshakes back-to-back. The 500 ms stagger only applies between
925+ /// h1[i] and h1[i+1] for i ≥ 1, so we don't burst the remaining
926+ /// h1[1..n] handshakes at the Google edge simultaneously. Each
927+ /// connection gets an 8 s expiry offset so they roll off gradually
928+ /// instead of all hitting POOL_TTL_SECS at once. If h2 ends up the
929+ /// active fast path, `run_pool_refill` trims the pool back down to
930+ /// `POOL_MIN_H2_FALLBACK` on the next tick — the extra warm h1
931+ /// sockets just age out naturally instead of being kept alive.
894932 pub async fn warm ( self : & Arc < Self > , n : usize ) {
895- // Try to bring up the h2 fast path first. If that succeeds,
896- // shrink the h1 pool warm count to the fallback minimum — the
897- // multiplexed h2 conn handles all real traffic, so the h1 pool
898- // only needs to cover the rare case where h2 dies mid-session.
899- let h2_alive = !self . h2_disabled . load ( Ordering :: Relaxed )
900- && self . ensure_h2 ( ) . await . is_some ( ) ;
901- let h1_target = if h2_alive { 2 . min ( n) } else { n } ;
933+ // Spawn the h2 prewarm in parallel so the h1 prewarm loop
934+ // below isn't blocked on it. Capturing the join handle lets
935+ // us still log "h2 fast path active" / "h1 fallback only"
936+ // accurately at the end.
937+ let h2_self = self . clone ( ) ;
938+ let h2_handle = tokio:: spawn ( async move {
939+ !h2_self. h2_disabled . load ( Ordering :: Relaxed )
940+ && h2_self. ensure_h2 ( ) . await . is_some ( )
941+ } ) ;
902942
903943 let mut warmed = 0usize ;
904- for i in 0 ..h1_target {
944+ for i in 0 ..n {
905945 if i > 0 {
906946 tokio:: time:: sleep ( Duration :: from_millis ( 500 ) ) . await ;
907947 }
@@ -922,6 +962,17 @@ impl DomainFronter {
922962 }
923963 }
924964 }
965+ // Join the h2 prewarm here only to log whether it landed; the
966+ // h1 pool above is already populated either way. A panic in
967+ // the spawned task surfaces as `JoinError` — log it explicitly
968+ // so it isn't indistinguishable from a clean ALPN refusal.
969+ let h2_alive = match h2_handle. await {
970+ Ok ( v) => v,
971+ Err ( e) => {
972+ tracing:: warn!( "h2 prewarm task failed to join: {}" , e) ;
973+ false
974+ }
975+ } ;
925976 if h2_alive {
926977 tracing:: info!(
927978 "h2 fast path active; h1 fallback pool pre-warmed with {} connection(s)" ,
@@ -970,7 +1021,10 @@ impl DomainFronter {
9701021 let cell = self . h2_cell . lock ( ) . await ;
9711022 let h2_alive = cell
9721023 . as_ref ( )
973- . map ( |c| c. created . elapsed ( ) . as_secs ( ) < H2_CONN_TTL_SECS )
1024+ . map ( |c| {
1025+ c. created . elapsed ( ) . as_secs ( ) < H2_CONN_TTL_SECS
1026+ && !c. dead . load ( Ordering :: Relaxed )
1027+ } )
9741028 . unwrap_or ( false ) ;
9751029 if h2_alive { POOL_MIN_H2_FALLBACK } else { POOL_MIN }
9761030 } ;
@@ -1115,16 +1169,18 @@ impl DomainFronter {
11151169 return None ;
11161170 }
11171171
1118- // Fast path: existing cell, within TTL. Clone (Arc bump) and
1119- // return without touching the open machinery . We can't peek at
1120- // SendRequest liveness directly (h2 0.4 doesn't expose
1121- // `is_closed`), so a request against a dead conn fails at
1122- // `ready()`/`send_request` and the caller poisons by
1123- // generation from there .
1172+ // Fast path: existing cell, within TTL and not flagged dead by
1173+ // the connection driver . We can't peek at SendRequest liveness
1174+ // synchronously (h2 0.4 doesn't expose `is_closed`), but the
1175+ // driver task does flip ` dead` when the underlying connection
1176+ // ends — so a known-dead cell is rejected here without paying
1177+ // a wasted h2 round trip to discover it .
11241178 {
11251179 let cell = self . h2_cell . lock ( ) . await ;
11261180 if let Some ( c) = cell. as_ref ( ) {
1127- if c. created . elapsed ( ) . as_secs ( ) < H2_CONN_TTL_SECS {
1181+ if c. created . elapsed ( ) . as_secs ( ) < H2_CONN_TTL_SECS
1182+ && !c. dead . load ( Ordering :: Relaxed )
1183+ {
11281184 return Some ( ( c. send . clone ( ) , c. generation ) ) ;
11291185 }
11301186 }
@@ -1155,7 +1211,9 @@ impl DomainFronter {
11551211 {
11561212 let cell = self . h2_cell . lock ( ) . await ;
11571213 if let Some ( c) = cell. as_ref ( ) {
1158- if c. created . elapsed ( ) . as_secs ( ) < H2_CONN_TTL_SECS {
1214+ if c. created . elapsed ( ) . as_secs ( ) < H2_CONN_TTL_SECS
1215+ && !c. dead . load ( Ordering :: Relaxed )
1216+ {
11591217 return Some ( ( c. send . clone ( ) , c. generation ) ) ;
11601218 }
11611219 }
@@ -1168,8 +1226,8 @@ impl DomainFronter {
11681226 tokio:: time:: timeout ( Duration :: from_secs ( H2_OPEN_TIMEOUT_SECS ) , self . open_h2 ( ) )
11691227 . await ;
11701228
1171- let send = match open_result {
1172- Ok ( Ok ( s ) ) => s ,
1229+ let ( send, dead ) = match open_result {
1230+ Ok ( Ok ( pair ) ) => pair ,
11731231 Ok ( Err ( OpenH2Error :: AlpnRefused ) ) => {
11741232 // Definitive: this peer doesn't speak h2. Sticky-disable
11751233 // so we never re-attempt the handshake.
@@ -1206,14 +1264,19 @@ impl DomainFronter {
12061264 send : send. clone ( ) ,
12071265 created : Instant :: now ( ) ,
12081266 generation,
1267+ dead,
12091268 } ) ;
12101269 Some ( ( send, generation) )
12111270 }
12121271
12131272 /// Open one TLS connection and run the h2 handshake. Returns a
12141273 /// typed `OpenH2Error` so the caller can recognize ALPN refusal
12151274 /// (sticky disable) without string-matching across boundaries.
1216- async fn open_h2 ( & self ) -> Result < h2:: client:: SendRequest < Bytes > , OpenH2Error > {
1275+ /// The returned `Arc<AtomicBool>` is the death flag the connection
1276+ /// driver flips when the h2 `Connection` future ends.
1277+ async fn open_h2 (
1278+ & self ,
1279+ ) -> Result < ( h2:: client:: SendRequest < Bytes > , Arc < AtomicBool > ) , OpenH2Error > {
12171280 let tcp = TcpStream :: connect ( ( self . connect_host . as_str ( ) , 443u16 ) ) . await ?;
12181281 let _ = tcp. set_nodelay ( true ) ;
12191282 let sni = self . next_sni ( ) ;
@@ -1228,7 +1291,7 @@ impl DomainFronter {
12281291 /// bypassing the hard-coded `connect_host:443` target.
12291292 async fn h2_handshake_post_tls (
12301293 tls : PooledStream ,
1231- ) -> Result < h2:: client:: SendRequest < Bytes > , OpenH2Error > {
1294+ ) -> Result < ( h2:: client:: SendRequest < Bytes > , Arc < AtomicBool > ) , OpenH2Error > {
12321295 let alpn_h2 = tls
12331296 . get_ref ( )
12341297 . 1
@@ -1251,15 +1314,19 @@ impl DomainFronter {
12511314 . map_err ( |e| OpenH2Error :: Handshake ( e. to_string ( ) ) ) ?;
12521315 // The connection task drives frame I/O independently of any
12531316 // SendRequest handle. When it ends (GOAWAY, network error, TTL),
1254- // existing handles will start failing on `ready()` / `send_request`
1255- // and `ensure_h2` will reopen on the next call.
1317+ // we flip the `dead` flag so `ensure_h2` and `run_pool_refill`
1318+ // can react within one refill tick instead of waiting for a
1319+ // request to discover the breakage via `ready()` failure.
1320+ let dead = Arc :: new ( AtomicBool :: new ( false ) ) ;
1321+ let dead_for_driver = dead. clone ( ) ;
12561322 tokio:: spawn ( async move {
12571323 if let Err ( e) = conn. await {
12581324 tracing:: debug!( "h2 connection closed: {}" , e) ;
12591325 }
1326+ dead_for_driver. store ( true , Ordering :: Relaxed ) ;
12601327 } ) ;
12611328 tracing:: info!( "h2 connection established to relay edge" ) ;
1262- Ok ( send)
1329+ Ok ( ( send, dead ) )
12631330 }
12641331
12651332 /// React to an h2-fronting-incompatibility HTTP response (status
@@ -5120,6 +5187,7 @@ hello";
51205187 send : send_v2. clone ( ) ,
51215188 created : Instant :: now ( ) ,
51225189 generation : 2 ,
5190+ dead : Arc :: new ( AtomicBool :: new ( false ) ) ,
51235191 } ) ;
51245192 }
51255193 // Task A poisons with stale gen=1.
@@ -5141,6 +5209,52 @@ hello";
51415209 server_handle. abort ( ) ;
51425210 }
51435211
5212+ #[ tokio:: test( flavor = "current_thread" ) ]
5213+ async fn ensure_h2_rejects_dead_cell_within_ttl ( ) {
5214+ // Cell is within H2_CONN_TTL_SECS but the connection driver
5215+ // already flipped `dead` (e.g., upstream sent GOAWAY). Without
5216+ // the dead-flag check `ensure_h2` would happily hand out the
5217+ // stale SendRequest and the next request would pay a wasted
5218+ // h2 round trip to discover the breakage. With the check in
5219+ // place a second pre-existing healthy cell still works fine —
5220+ // the dead one is replaced via the open-lock path.
5221+ let ( addr, server_handle) = spawn_h2c_server ( |_req| {
5222+ let resp = http:: Response :: builder ( ) . status ( 200 ) . body ( ( ) ) . unwrap ( ) ;
5223+ ( resp, Vec :: new ( ) )
5224+ } )
5225+ . await ;
5226+ let send = h2c_client ( addr) . await ;
5227+
5228+ let fronter = fronter_for_test ( false ) ;
5229+ let dead = Arc :: new ( AtomicBool :: new ( true ) ) ; // simulate driver having exited
5230+ {
5231+ let mut cell = fronter. h2_cell . lock ( ) . await ;
5232+ * cell = Some ( H2Cell {
5233+ send,
5234+ created : Instant :: now ( ) , // well within TTL
5235+ generation : 1 ,
5236+ dead : dead. clone ( ) ,
5237+ } ) ;
5238+ }
5239+
5240+ // The fast path normally returns Some(send, gen) when the cell
5241+ // is within TTL. With dead=true it must NOT return the stale
5242+ // SendRequest. Pre-set the failure-backoff timestamp so
5243+ // ensure_h2 short-circuits at the backoff check (no network
5244+ // I/O) regardless of whatever's bound on 127.0.0.1:443 on the
5245+ // dev/CI host. This isolates the assertion to the new
5246+ // dead-flag check.
5247+ * fronter. h2_open_failed_at . lock ( ) . await = Some ( Instant :: now ( ) ) ;
5248+
5249+ let result = fronter. ensure_h2 ( ) . await ;
5250+ assert ! (
5251+ result. is_none( ) ,
5252+ "ensure_h2 must not serve a cell whose driver flipped `dead`"
5253+ ) ;
5254+
5255+ server_handle. abort ( ) ;
5256+ }
5257+
51445258 #[ tokio:: test( flavor = "current_thread" ) ]
51455259 async fn ensure_h2_skips_reopen_during_failure_backoff ( ) {
51465260 // After an open failure, ensure_h2 must return None for at
@@ -5566,6 +5680,7 @@ hello";
55665680 send : send. clone ( ) ,
55675681 created : Instant :: now ( ) ,
55685682 generation : 7 ,
5683+ dead : Arc :: new ( AtomicBool :: new ( false ) ) ,
55695684 } ) ;
55705685 }
55715686 // Pretend a round-trip just incremented h2_calls (which is
@@ -5682,7 +5797,7 @@ hello";
56825797 match result {
56835798 Err ( OpenH2Error :: AlpnRefused ) => { } // expected
56845799 Err ( other) => panic ! ( "expected AlpnRefused, got {:?}" , other) ,
5685- Ok ( _ ) => panic ! ( "expected AlpnRefused, got Ok" ) ,
5800+ Ok ( ( _send , _dead ) ) => panic ! ( "expected AlpnRefused, got Ok" ) ,
56865801 }
56875802 server. await . unwrap ( ) ;
56885803 }
0 commit comments