fix(pow): run PoW probe concurrently with the DM wait

grunch · claude · grunch · commit 2050955e9197 · 2026-05-28T13:29:39.000-03:00
Previous attempt (commit 72ced79) capped the postflight info-event lookup at 3s, but it still ran sequentially after the 15s DM wait timed out, so the user-visible failure path could take up to 18s when the relay was slow. Run the probe concurrently with the DM wait instead: by the time the wait elapses the probe's answer is typically already in hand, so the timeout branch consumes a resolved JoinHandle with ~0s added latency. POW_PROBE_TIMEOUT is kept as a safety net for pathological relays that outlive the 15s wait. The probe needs a 'static future for tokio::spawn, so the work moves into a new fetch_required_pow_with(client, mostro_pubkey) — the existing fetch_required_pow(ctx) becomes a thin wrapper around it. On the happy path (DM arrives in time) the spawned probe is aborted so we don't leak a stray relay request. Addresses review feedback on #173. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/docs/pow_error_handling.md b/docs/pow_error_handling.md
@@ -122,20 +122,31 @@ through (`add_invoice`, `take_order`, `take_dispute`, `send_msg`, `new_order`,
 `rate_user`, `orders_info`, `restore`, `last_trade_index`, `add_bond_invoice`).
 Centralizing the fix here covers every command in one place.
 
-Postflight check (chosen — see Alternatives below):
+Concurrent probe (chosen — see Alternatives below):
 
 ```rust
+// Kick off the PoW probe alongside the DM wait so its answer is in hand
+// the moment the wait times out. The probe is cheap to start and cheap to
+// cancel via JoinHandle::abort() on the happy path.
+let pow_probe = tokio::spawn(fetch_required_pow_with(
+    ctx.client.clone(),
+    ctx.mostro_pubkey,
+));
+
+let waited = tokio::time::timeout(FETCH_EVENTS_TIMEOUT, /* notification loop */).await;
+
 let event = match waited {
-    Ok(inner) => inner?,
+    Ok(inner) => {
+        pow_probe.abort();
+        inner?
+    }
     Err(_elapsed) => {
-        // Before declaring this a generic timeout, check whether the daemon
-        // advertises a PoW requirement we didn't meet — that's the real
-        // cause "deadline has elapsed" was hiding. Bounded by
-        // POW_PROBE_TIMEOUT so a slow/unreachable relay can't double the
-        // user-visible wait; if the probe doesn't return in time we fall
-        // through to the generic timeout error instead of hanging.
-        let probe = tokio::time::timeout(POW_PROBE_TIMEOUT, fetch_required_pow(ctx)).await;
-        if let Ok(Some(required)) = probe {
+        // Probe has been running for FETCH_EVENTS_TIMEOUT alongside the
+        // wait; it should already be done. POW_PROBE_TIMEOUT is a safety
+        // net for pathological relays — if the answer isn't in by then,
+        // fall through to the generic timeout error.
+        let probe_result = tokio::time::timeout(POW_PROBE_TIMEOUT, pow_probe).await;
+        if let Ok(Ok(Some(required))) = probe_result {
             let configured = parse_pow_env().unwrap_or(0);
             if required > configured {
                 return Err(PowRequirementUnmet { required, configured }.into());
@@ -146,9 +157,13 @@ let event = match waited {
 };
 ```
 
-`POW_PROBE_TIMEOUT` is a small constant (currently 3 s) — well below
-`FETCH_EVENTS_TIMEOUT` (15 s). Worst-case user-visible wait stays at one
-`FETCH_EVENTS_TIMEOUT` plus the probe budget instead of doubling.
+The probe lives in `events::fetch_required_pow_with(client, mostro_pubkey)`
+— an owned-args sibling of `fetch_required_pow(ctx)`, used so the spawned
+future is `'static`. The 3 s `POW_PROBE_TIMEOUT` is now a safety net rather
+than the typical wait: in the common timeout case the probe has already
+resolved, so the user-visible wait is `FETCH_EVENTS_TIMEOUT` (15 s) plus
+~0 s. Worst case it is 15 s + 3 s = 18 s, versus the 30 s an unbounded
+sequential probe (a second full `FETCH_EVENTS_TIMEOUT`) could take.
 
 Add an `&Context` parameter? Look at the signature today —
 `wait_for_dm(ctx, order_trade_keys, sent_message)` — `ctx` is already
diff --git a/src/util/events.rs b/src/util/events.rs
@@ -144,9 +144,24 @@ pub async fn fetch_bond_claim_window_days(ctx: &crate::cli::Context) -> Option<i
 /// Used by [`crate::util::messaging::wait_for_dm`] to distinguish a real
 /// timeout from a silent PoW rejection — see `docs/pow_error_handling.md`.
 pub async fn fetch_required_pow(ctx: &crate::cli::Context) -> Option<u8> {
-    fetch_info_tag(ctx, "pow")
+    fetch_required_pow_with(ctx.client.clone(), ctx.mostro_pubkey).await
+}
+
+/// Owned-args variant of [`fetch_required_pow`], suitable for `tokio::spawn`.
+///
+/// `wait_for_dm` kicks the probe off concurrently with the DM wait so the
+/// answer is typically already in hand when the wait times out (near-zero
+/// added latency in the timeout path, instead of a sequential second fetch).
+pub async fn fetch_required_pow_with(client: Client, mostro_pubkey: PublicKey) -> Option<u8> {
+    let filter = Filter::new()
+        .author(mostro_pubkey)
+        .kind(nostr_sdk::Kind::Custom(NOSTR_INFO_EVENT_KIND));
+    let events = client
+        .fetch_events(filter, FETCH_EVENTS_TIMEOUT)
         .await
-        .and_then(|v| v.parse::<u8>().ok())
+        .ok()?;
+    let event = events.iter().max_by_key(|e| e.created_at)?;
+    read_info_tag_from_event(event, "pow").and_then(|v| v.parse::<u8>().ok())
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/src/util/messaging.rs b/src/util/messaging.rs
@@ -329,6 +329,17 @@ where
     // Send message here after opening notifications to avoid missing messages.
     sent_message.await?;
 
+    // Kick off the PoW probe concurrently with the DM wait. By running the
+    // kind-38385 lookup alongside the 15s `FETCH_EVENTS_TIMEOUT` instead of
+    // *after* it, the timeout branch doesn't pay a second sequential
+    // `fetch_events` round-trip — by then the probe has typically already
+    // returned. `JoinHandle` lets us `abort()` the probe cheaply on the happy
+    // path (DM arrives in time) without leaking the task.
+    let pow_probe = tokio::spawn(super::events::fetch_required_pow_with(
+        ctx.client.clone(),
+        ctx.mostro_pubkey,
+    ));
+
     // Wait for the DM or gift wrap event
     let waited = tokio::time::timeout(super::events::FETCH_EVENTS_TIMEOUT, async move {
         loop {
@@ -346,22 +357,23 @@ where
     // Keep a genuine timeout (the only "no reply" outcome) distinguishable from
     // a notification-channel error so callers can treat them differently.
     let event = match waited {
-        Ok(inner) => inner?,
+        Ok(inner) => {
+            // Happy path: DM arrived. Cancel the probe; the answer is no
+            // longer needed and we don't want a stray relay request lingering.
+            pow_probe.abort();
+            inner?
+        }
         Err(_elapsed) => {
             // mostrod silently drops events whose outer GiftWrap doesn't meet
             // its NIP-13 PoW requirement (relay accepts → daemon discards →
-            // no reply ever comes). Before declaring this a generic timeout,
-            // ask the daemon's kind-38385 info event whether that's actually
-            // the cause we're hiding behind "no reply".
-            //
-            // The probe is bounded by `POW_PROBE_TIMEOUT` instead of the full
-            // `FETCH_EVENTS_TIMEOUT` so a slow/unreachable relay can't double
-            // the user-visible wait. If the probe doesn't return in time, fall
-            // through to the generic timeout error rather than hanging.
-            let probe =
-                tokio::time::timeout(POW_PROBE_TIMEOUT, super::events::fetch_required_pow(ctx))
-                    .await;
-            if let Ok(Some(required)) = probe {
+            // no reply ever comes). The probe has already been running for
+            // `FETCH_EVENTS_TIMEOUT` alongside the wait, so it is almost
+            // certainly done. Cap the await with `POW_PROBE_TIMEOUT` as a
+            // safety net so a pathological relay can't keep us hanging — if
+            // the probe isn't back by then, fall through to the generic
+            // timeout error instead of waiting any longer.
+            let probe_result = tokio::time::timeout(POW_PROBE_TIMEOUT, pow_probe).await;
+            if let Ok(Ok(Some(required))) = probe_result {
                 let configured = parse_pow_env().unwrap_or(0);
                 if required > configured {
                     return Err(PowRequirementUnmet {