fix: WebSocket passthrough and stream timeout decoupling

CaptainMirage · maybeknott · commit 87594b1abb1e · 2026-05-24T13:52:22.000+03:30
Inside the MITM TLS session, detect Connection: Upgrade + Upgrade:
websocket before hitting the Apps Script relay. Establish a direct TLS
connection to the real server (via upstream_socks5 if configured),
relay the upgrade handshake, then splice both directions with
copy_bidirectional. Apps Script cannot hold persistent connections so
the bypass is the only viable path for wss://.

Split request_timeout_secs (connect + response headers, default 30s) from
a new stream_timeout_secs (per-chunk body idle, default 300s) so large
range downloads through Apps Script are not killed mid-transfer by
batch_timeout firing during the body-drain phase.
diff --git a/src/config.rs b/src/config.rs
@@ -376,9 +376,22 @@ pub struct Config {
     /// retry sooner when a deployment hangs. Floor `5`, ceiling `300`
     /// (anything beyond exceeds Apps Script's hard 6-min cap with
     /// no benefit).
+    ///
+    /// This applies to connection establishment and response header
+    /// arrival only. Body streaming is governed by `stream_timeout_secs`.
     #[serde(default = "default_request_timeout_secs")]
     pub request_timeout_secs: u64,
 
+    /// Per-chunk body streaming idle timeout (seconds). Default `300`.
+    /// Applies to each individual body chunk read after headers arrive —
+    /// if no chunk arrives within this window the connection is treated
+    /// as stalled and the request is aborted. Distinct from
+    /// `request_timeout_secs` so large responses through Apps Script
+    /// (where each 256 KB range chunk can take 30-90s) are not killed
+    /// mid-transfer. Floor `10`, ceiling `3600`.
+    #[serde(default = "default_stream_timeout_secs")]
+    pub stream_timeout_secs: u64,
+
     /// Optional second-hop exit node, for sites that block traffic
     /// from Google datacenter IPs (Apps Script's outbound IP space).
     /// Most visibly: Cloudflare-fronted services that flag the GCP IP
@@ -531,6 +544,10 @@ fn default_auto_blacklist_cooldown_secs() -> u64 { 120 }
 /// hard-coded `BATCH_TIMEOUT` and Apps Script's typical response cliff.
 fn default_request_timeout_secs() -> u64 { 30 }
 
+/// Serde default for `stream_timeout_secs`: a 300-second idle limit on each
+/// body-chunk read, independent of the header/connect `request_timeout_secs`.
+fn default_stream_timeout_secs() -> u64 { 300 }
+
 fn default_google_ip() -> String {
     "216.239.38.120".into()
 }
@@ -766,6 +783,8 @@ pub struct TomlRelay {
     pub auto_blacklist_cooldown_secs: u64,
     #[serde(default = "default_request_timeout_secs")]
     pub request_timeout_secs: u64,
+    #[serde(default = "default_stream_timeout_secs")]
+    pub stream_timeout_secs: u64,
 }
 
 /// [network] section of config.toml.
@@ -919,6 +938,7 @@ impl From<TomlConfig> for Config {
             auto_blacklist_window_secs: t.relay.auto_blacklist_window_secs,
             auto_blacklist_cooldown_secs: t.relay.auto_blacklist_cooldown_secs,
             request_timeout_secs: t.relay.request_timeout_secs,
+            stream_timeout_secs: t.relay.stream_timeout_secs,
             exit_node: t.exit_node,
         }
     }
@@ -946,6 +966,7 @@ impl From<&Config> for TomlConfig {
                 auto_blacklist_window_secs: c.auto_blacklist_window_secs,
                 auto_blacklist_cooldown_secs: c.auto_blacklist_cooldown_secs,
                 request_timeout_secs: c.request_timeout_secs,
+                stream_timeout_secs: c.stream_timeout_secs,
             },
             network: TomlNetwork {
                 google_ip: c.google_ip.clone(),
diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs
@@ -423,7 +423,12 @@ pub struct DomainFronter {
     /// Per-batch HTTP timeout. Mirrors `Config::request_timeout_secs`
     /// (#430, masterking32 PR #25). Read by `tunnel_client::fire_batch`
     /// so a single config field tunes the timeout used everywhere.
+    /// Applies to connection establishment and response header arrival only.
     batch_timeout: Duration,
+    /// Per-chunk body streaming idle timeout. Mirrors `Config::stream_timeout_secs`.
+    /// Applied per-iteration of the body drain loop so large responses
+    /// through Apps Script are not killed mid-transfer by `batch_timeout`.
+    stream_timeout: Duration,
     /// Optional second-hop exit node (Deno Deploy / fly.io / etc.)
     /// to bypass CF-anti-bot blocks on sites that flag Google datacenter
     /// IPs (chatgpt.com, claude.ai, grok.com, x.com). Mirrors
@@ -642,6 +647,9 @@ impl DomainFronter {
             batch_timeout: Duration::from_secs(
                 config.request_timeout_secs.clamp(5, 300),
             ),
+            stream_timeout: Duration::from_secs(
+                config.stream_timeout_secs.clamp(10, 3600),
+            ),
             exit_node_enabled: config.exit_node.enabled
                 && !config.exit_node.relay_url.is_empty()
                 && !config.exit_node.psk.is_empty(),
@@ -697,6 +705,11 @@ impl DomainFronter {
         self.batch_timeout
     }
 
+    /// Idle timeout per body-chunk read: `Config::stream_timeout_secs` clamped to `[10s, 3600s]`.
+    pub(crate) fn stream_timeout(&self) -> Duration {
+        self.stream_timeout
+    }
+
     /// Record one relay call toward the daily budget. Called once per
     /// outbound Apps Script fetch. Rolls over both daily counters at
     /// 00:00 Pacific Time, matching Apps Script's quota reset cadence
@@ -1533,18 +1546,17 @@ impl DomainFronter {
             })?;
         }
 
-        // Phase 2: response headers + body drain. Bounded by the
-        // caller's deadline. Errors and timeout here are
-        // `RequestSent::Maybe` — the request is on the wire and may
-        // already have side effects.
-        let response_phase = async {
+        // Phase 2a: wait for response headers. Bounded by the caller's
+        // deadline (`batch_timeout` / `request_timeout_secs`). Errors and a
+        // timeout here are `RequestSent::Maybe` — the request is on the wire.
+        let header_phase = async {
             let response = response_fut.await.map_err(|e| {
                 (
                     FronterError::Relay(format!("h2 response: {}", e)),
                     RequestSent::Maybe,
                 )
             })?;
-            let (parts, mut body) = response.into_parts();
+            let (parts, body) = response.into_parts();
             let status = parts.status.as_u16();
 
             // Convert headers to the (String, String) Vec the rest of
@@ -1557,27 +1569,12 @@ impl DomainFronter {
                     headers.push((name.as_str().to_string(), v.to_string()));
                 }
             }
-
-            // Drain body. Release flow-control credit per chunk so
-            // large responses don't stall after the initial 4 MB window.
-            let mut buf: Vec<u8> = Vec::new();
-            while let Some(chunk) = body.data().await {
-                let chunk = chunk.map_err(|e| {
-                    (
-                        FronterError::Relay(format!("h2 body chunk: {}", e)),
-                        RequestSent::Maybe,
-                    )
-                })?;
-                let n = chunk.len();
-                buf.extend_from_slice(&chunk);
-                let _ = body.flow_control().release_capacity(n);
-            }
-            Ok::<_, (FronterError, RequestSent)>((status, headers, buf))
+            Ok::<_, (FronterError, RequestSent)>((status, headers, body))
         };
 
-        let (status, headers, mut buf) = match tokio::time::timeout(
+        let (status, headers, mut body) = match tokio::time::timeout(
             response_deadline,
-            response_phase,
+            header_phase,
         )
         .await
         {
@@ -1586,6 +1583,32 @@ impl DomainFronter {
             Err(_) => return Err((FronterError::Timeout, RequestSent::Maybe)),
         };
 
+        // Phase 2b: drain body. Each chunk is individually bounded by
+        // `stream_timeout` (default 300s) so large responses routed
+        // through Apps Script (where a 256 KB range chunk can take 30-90s
+        // of wall-clock time) are not killed by the tighter `batch_timeout`.
+        // Release flow-control credit per chunk so large responses don't
+        // stall after the initial 4 MB window.
+        let stream_timeout = self.stream_timeout();
+        let mut buf: Vec<u8> = Vec::new();
+        loop {
+            match tokio::time::timeout(stream_timeout, body.data()).await {
+                Ok(None) => break,
+                Ok(Some(Ok(chunk))) => {
+                    let n = chunk.len();
+                    buf.extend_from_slice(&chunk);
+                    let _ = body.flow_control().release_capacity(n);
+                }
+                Ok(Some(Err(e))) => {
+                    return Err((
+                        FronterError::Relay(format!("h2 body chunk: {}", e)),
+                        RequestSent::Maybe,
+                    ));
+                }
+                Err(_) => return Err((FronterError::Timeout, RequestSent::Maybe)),
+            }
+        }
+
         // Mirror `read_http_response`: if the server gzipped the body
         // (we asked for it via accept-encoding), decompress before
         // handing back so downstream JSON / envelope parsers see plain
diff --git a/src/proxy_server.rs b/src/proxy_server.rs
@@ -1818,7 +1818,7 @@ async fn dispatch_tunnel(
             host,
             port
         );
-        run_mitm_then_relay(sock, &host, port, mitm, &fronter).await;
+        run_mitm_then_relay(sock, &host, port, mitm, &fronter, &rewrite_ctx.tls_connector, rewrite_ctx.upstream_socks5.as_deref()).await;
         return Ok(());
     }
 
@@ -1832,7 +1832,7 @@ async fn dispatch_tunnel(
             port,
             scheme
         );
-        relay_http_stream_raw(sock, &host, port, scheme, &fronter).await;
+        relay_http_stream_raw(sock, &host, port, scheme, &fronter, &rewrite_ctx.tls_connector, rewrite_ctx.upstream_socks5.as_deref()).await;
         return Ok(());
     }
 
@@ -2115,6 +2115,8 @@ async fn run_mitm_then_relay(
     port: u16,
     mitm: Arc<Mutex<MitmCertManager>>,
     fronter: &DomainFronter,
+    tls_connector: &TlsConnector,
+    upstream_socks5: Option<&str>,
 ) {
     // Peek the TLS ClientHello BEFORE minting the MITM cert. When the client
     // resolves the hostname itself (DoH in Chrome/Firefox) and hands us a raw
@@ -2176,7 +2178,7 @@ async fn run_mitm_then_relay(
     // latter would produce an IP-in-Host request that Cloudflare/etc. reject
     // outright.
     loop {
-        match handle_mitm_request(&mut tls, &effective_host, port, fronter, "https").await {
+        match handle_mitm_request(&mut tls, &effective_host, port, fronter, "https", tls_connector, upstream_socks5).await {
             Ok(true) => continue,
             Ok(false) => break,
             Err(e) => {
@@ -2203,9 +2205,11 @@ async fn relay_http_stream_raw(
     port: u16,
     scheme: &str,
     fronter: &DomainFronter,
+    tls_connector: &TlsConnector,
+    upstream_socks5: Option<&str>,
 ) {
     loop {
-        match handle_mitm_request(&mut sock, host, port, fronter, scheme).await {
+        match handle_mitm_request(&mut sock, host, port, fronter, scheme, tls_connector, upstream_socks5).await {
             Ok(true) => continue,
             Ok(false) => break,
             Err(e) => {
@@ -2377,12 +2381,139 @@ fn parse_host_port(target: &str) -> (String, u16) {
     }
 }
 
+/// Re-encode a parsed request head as HTTP/1.x wire bytes: the request line,
+/// every header except `Proxy-Connection` / `Proxy-Authorization` (matched
+/// case-insensitively), then the terminating blank line. Used to replay the
+/// client's upgrade request to the origin during WebSocket passthrough.
+fn rebuild_request_bytes(method: &str, path: &str, version: &str, headers: &[(String, String)]) -> Vec<u8> {
+    const PROXY_ONLY: [&str; 2] = ["proxy-connection", "proxy-authorization"];
+    const CRLF: &[u8] = b"\r\n";
+
+    let mut wire = Vec::with_capacity(512);
+    // Request line: METHOD SP PATH SP VERSION CRLF.
+    for part in [method.as_bytes(), b" ", path.as_bytes(), b" ", version.as_bytes(), CRLF] {
+        wire.extend_from_slice(part);
+    }
+    let forwarded = headers
+        .iter()
+        .filter(|(name, _)| !PROXY_ONLY.iter().any(|p| name.eq_ignore_ascii_case(p)));
+    for (name, value) in forwarded {
+        for part in [name.as_bytes(), b": ", value.as_bytes(), CRLF] {
+            wire.extend_from_slice(part);
+        }
+    }
+    wire.extend_from_slice(CRLF);
+    wire
+}
+
+/// Splice a WebSocket upgrade through to the real origin, bypassing the relay.
+///
+/// Connects to `host:port` (through `upstream_socks5` when set), completes a TLS
+/// handshake, forwards `upgrade_request`, relays the response head to `client`,
+/// and only when the status code is exactly `101` copies bytes both ways until
+/// one side closes. Upstream connect/TLS failures are answered with a `502` and
+/// `Ok(())`; `Err` is returned only for I/O errors on either stream.
+async fn ws_tls_passthrough<S>(
+    client: &mut S,
+    host: &str,
+    port: u16,
+    upgrade_request: &[u8],
+    tls_connector: &TlsConnector,
+    upstream_socks5: Option<&str>,
+) -> std::io::Result<()>
+where
+    S: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
+{
+    const BAD_GATEWAY: &[u8] = b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\nConnection: close\r\n\r\n";
+    let connect_timeout = std::time::Duration::from_secs(15);
+
+    let tcp = if let Some(proxy) = upstream_socks5 {
+        match socks5_connect_via(proxy, host, port).await {
+            Ok(s) => s,
+            Err(e) => {
+                tracing::warn!("ws passthrough: socks5 {} -> {}:{} failed: {}", proxy, host, port, e);
+                client.write_all(BAD_GATEWAY).await?;
+                return Ok(());
+            }
+        }
+    } else {
+        match tokio::time::timeout(connect_timeout, TcpStream::connect((host, port))).await {
+            Ok(Ok(s)) => s,
+            Ok(Err(e)) => {
+                tracing::warn!("ws passthrough: direct connect to {}:{} failed: {}", host, port, e);
+                client.write_all(BAD_GATEWAY).await?;
+                return Ok(());
+            }
+            Err(_) => {
+                tracing::warn!("ws passthrough: connect to {}:{} timed out", host, port);
+                client.write_all(BAD_GATEWAY).await?;
+                return Ok(());
+            }
+        }
+    };
+
+    let server_name = match ServerName::try_from(host.to_string()) {
+        Ok(sn) => sn,
+        Err(_) => {
+            tracing::warn!("ws passthrough: invalid server name {}", host);
+            client.write_all(BAD_GATEWAY).await?;
+            return Ok(());
+        }
+    };
+
+    // Bound the handshake too: a peer that stalls TLS would otherwise hang this client.
+    let handshake = tls_connector.connect(server_name, tcp);
+    let mut server = match tokio::time::timeout(connect_timeout, handshake).await {
+        Ok(Ok(s)) => s,
+        Ok(Err(e)) => {
+            tracing::warn!("ws passthrough: TLS to {}:{} failed: {}", host, port, e);
+            client.write_all(BAD_GATEWAY).await?;
+            return Ok(());
+        }
+        Err(_) => {
+            tracing::warn!("ws passthrough: TLS to {}:{} timed out", host, port);
+            client.write_all(BAD_GATEWAY).await?;
+            return Ok(());
+        }
+    };
+    server.write_all(upgrade_request).await?;
+    server.flush().await?;
+
+    // One byte at a time so nothing past the blank line is consumed; later bytes are spliced.
+    let mut resp_buf: Vec<u8> = Vec::with_capacity(512);
+    let mut byte = [0u8; 1];
+    while !resp_buf.ends_with(b"\r\n\r\n") {
+        if resp_buf.len() > 8192 {
+            tracing::warn!("ws passthrough: server response headers too large from {}:{}", host, port);
+            client.write_all(BAD_GATEWAY).await?;
+            return Ok(());
+        }
+        server.read_exact(&mut byte).await?;
+        resp_buf.push(byte[0]);
+    }
+    client.write_all(&resp_buf).await?;
+    client.flush().await?;
+
+    // Compare the status-code token; a substring test would accept "HTTP/1.1 400 error 1012".
+    let head = String::from_utf8_lossy(&resp_buf);
+    let status_line = head.lines().next().unwrap_or("");
+    if status_line.split_ascii_whitespace().nth(1) != Some("101") {
+        tracing::warn!("ws passthrough: {}:{} refused upgrade ({})", host, port, status_line.trim());
+        return Ok(());
+    }
+    tracing::info!("ws passthrough: splicing {}:{}", host, port);
+    let _ = tokio::io::copy_bidirectional(client, &mut server).await;
+    Ok(())
+}
+
 async fn handle_mitm_request<S>(
     stream: &mut S,
     host: &str,
     port: u16,
     fronter: &DomainFronter,
     scheme: &str,
+    tls_connector: &TlsConnector,
+    upstream_socks5: Option<&str>,
 ) -> std::io::Result<bool>
 where
     S: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
@@ -2415,11 +2546,24 @@ where
         }
     };
 
-    let (method, path, _version, headers) = match parse_request_head(&head) {
+    let (method, path, version, headers) = match parse_request_head(&head) {
         Some(v) => v,
         None => return Ok(false),
     };
 
+    // WebSocket upgrade: Apps Script cannot relay persistent connections.
+    // Detect before read_body (upgrade requests have no body) and splice
+    // directly to the real server instead.
+    let is_ws_upgrade =
+        header_value(&headers, "connection").map(|v| v.to_ascii_lowercase().contains("upgrade")).unwrap_or(false)
+        && header_value(&headers, "upgrade").map(|v| v.eq_ignore_ascii_case("websocket")).unwrap_or(false);
+    if is_ws_upgrade {
+        tracing::info!("WebSocket upgrade for {}:{} — bypassing Apps Script relay", host, port);
+        let raw_request = rebuild_request_bytes(&method, &path, &version, &headers);
+        ws_tls_passthrough(stream, host, port, &raw_request, tls_connector, upstream_socks5).await?;
+        return Ok(false);
+    }
+
     let body = read_body(stream, &leftover, &headers).await?;
 
     // ── Per-host URL fix-ups ──────────────────────────────────────────