fix(flows): 1-retry wrapper for agent buy prompt in flow-13/14

bussyjd · bussyjd · commit 7a7d51be9c20 · 2026-05-15T14:25:47.000+08:00
Step 46 of flow-13/14 sends a long single-shot prompt to the
obol-agent (qwen36-fast, ~4B params) telling it to invoke buy.py via
its terminal tool. qwen36-fast occasionally narrates a fabricated
failure (HTTP 404 path-doubling, eRPC DNS error, etc.) instead of
actually running the bash command. When that happens, no
PurchaseRequest is created and step 47 fails with "PurchaseRequest CR
not ready" — even though buy.py was never invoked.

This commit factors the prompt into agent_buy_with_retry() in
lib-dual-stack.sh and replaces both flow-13 and flow-14 step 46 with
a single call. The wrapper:

  1. Sends the prompt as before.
  2. Polls bob's hermes-obol-agent namespace for the alice-obol
     PurchaseRequest (PR) for up to 60s.
  3. If the PR doesn't appear, prints a LOUD warning box flagging
     this as documented agent unreliability and re-sends the prompt
     once.
  4. If still absent, step 47 fails as before.

Net effect: a single flaky agent attempt no longer FAILs the flow —
a genuine agent flake is absorbed by the one retry — while a real
regression still fails loudly at step 47. The WARN box on retry is
the audit trail: if it fires regularly, the smoke test needs a more
reliable LLM (qwen36-deep / qwen36-35b-heretic) or a non-agent fallback.

Refers: plans/inference-v1337-followup-20260514.md (the v1337 buy
attempt-5 SIGKILL false-positive was the same flake class)

Saves ~50 lines of duplication between the two flow scripts.
diff --git a/flows/flow-13-dual-stack-obol.sh b/flows/flow-13-dual-stack-obol.sh
@@ -899,31 +899,7 @@ pass "Agent discovery prompt issued (success will be confirmed by buy + Purchase
 # ═════════════════════════════════════════════════════════════════
 
 step "Bob's agent: buy 5 OBOL Permit2 auths from Alice"
-buy_response=$(curl -sf --max-time 300 \
-    -X POST "http://localhost:${BOB_AGENT_PORT}/v1/chat/completions" \
-    -H "Authorization: Bearer $BOB_TOKEN" \
-    -H "Content-Type: application/json" \
-    -d "{
-        \"model\": \"$BOB_AGENT_RUNTIME-agent\",
-        \"messages\": [{
-            \"role\": \"user\",
-            \"content\": \"Use the buy-x402 skill and your terminal tool. Run exactly once: ERPC_URL=http://erpc.erpc.svc.cluster.local/rpc ERPC_NETWORK=base-sepolia python3 $BOB_OBOL_SKILLS_DIR/buy-x402/scripts/buy.py buy alice-obol --endpoint $TUNNEL_URL/services/alice-obol-inference/v1/chat/completions --model $OBOL_LLM_MODEL --count 5\"
-        }],
-        \"max_tokens\": 4000,
-        \"stream\": false
-    }" 2>&1 || true)
-buy_content=$(extract_assistant_content "$buy_response" 2>/dev/null || true)
-echo "${buy_content:0:500}"
-# Don't grep buy_content for natural-language confirmation; structural success
-# is the PurchaseRequest CR Ready=True poll below.
-if [ -z "$(printf '%s' "$buy_content" | tr -d '[:space:]')" ]; then
-    echo "  ! Agent returned no final assistant text; confirming purchase via PurchaseRequest CR"
-fi
-if printf '%s' "$buy_content" | agent_response_refused; then
-    fail "Agent refused to run buy.py: ${buy_content:0:500}"
-    emit_metrics; exit 1
-fi
-pass "Agent buy prompt issued (success will be confirmed by PurchaseRequest CR)"
+agent_buy_with_retry
 
 # ═════════════════════════════════════════════════════════════════
 # 36-39. PR Ready / LiteLLM rollout / sidecar auths / paid call
diff --git a/flows/flow-14-live-obol-base-sepolia.sh b/flows/flow-14-live-obol-base-sepolia.sh
@@ -953,29 +953,7 @@ pass "Agent discovery prompt issued (success will be confirmed by buy + Purchase
 # ═════════════════════════════════════════════════════════════════
 
 step "Bob's agent: buy 5 OBOL Permit2 auths from Alice"
-buy_response=$(curl -sf --max-time 300 \
-    -X POST "http://localhost:${BOB_AGENT_PORT}/v1/chat/completions" \
-    -H "Authorization: Bearer $BOB_TOKEN" \
-    -H "Content-Type: application/json" \
-    -d "{
-        \"model\": \"$BOB_AGENT_RUNTIME-agent\",
-        \"messages\": [{
-            \"role\": \"user\",
-            \"content\": \"Use the buy-x402 skill and your terminal tool. Run exactly once: ERPC_URL=http://erpc.erpc.svc.cluster.local/rpc ERPC_NETWORK=base-sepolia python3 $BOB_OBOL_SKILLS_DIR/buy-x402/scripts/buy.py buy alice-obol --endpoint $TUNNEL_URL/services/alice-obol-inference/v1/chat/completions --model $OBOL_LLM_MODEL --count 5\"
-        }],
-        \"max_tokens\": 4000,
-        \"stream\": false
-    }" 2>&1 || true)
-buy_content=$(extract_assistant_content "$buy_response" 2>/dev/null || true)
-echo "${buy_content:0:500}"
-if [ -z "$(printf '%s' "$buy_content" | tr -d '[:space:]')" ]; then
-    echo "  ! Agent returned no final assistant text; confirming purchase via PurchaseRequest CR"
-fi
-if printf '%s' "$buy_content" | agent_response_refused; then
-    fail "Agent refused to run buy.py: ${buy_content:0:500}"
-    emit_metrics; exit 1
-fi
-pass "Agent buy prompt issued (success will be confirmed by PurchaseRequest CR)"
+agent_buy_with_retry
 
 # ═════════════════════════════════════════════════════════════════
 # 31-34. PR Ready / LiteLLM rollout / sidecar auths / paid call
diff --git a/flows/lib-dual-stack.sh b/flows/lib-dual-stack.sh
@@ -347,6 +347,85 @@ except Exception as e:
 " 2>&1 || true
 }
 
+# Send the long single-shot buy prompt to Bob's agent. The prompt expands
+# against the caller's environment (BOB_AGENT_PORT, BOB_TOKEN,
+# BOB_AGENT_RUNTIME, BOB_OBOL_SKILLS_DIR, TUNNEL_URL, OBOL_LLM_MODEL).
+_agent_buy_send_prompt() {  # POST the buy prompt to Bob's agent; whatever curl prints becomes this function's stdout
+    curl -sf --max-time 300 \
+        -X POST "http://localhost:${BOB_AGENT_PORT}/v1/chat/completions" \
+        -H "Authorization: Bearer $BOB_TOKEN" \
+        -H "Content-Type: application/json" \
+        -d "{
+            \"model\": \"$BOB_AGENT_RUNTIME-agent\",
+            \"messages\": [{
+                \"role\": \"user\",
+                \"content\": \"Use the buy-x402 skill and your terminal tool. Run exactly once: ERPC_URL=http://erpc.erpc.svc.cluster.local/rpc ERPC_NETWORK=base-sepolia python3 $BOB_OBOL_SKILLS_DIR/buy-x402/scripts/buy.py buy alice-obol --endpoint $TUNNEL_URL/services/alice-obol-inference/v1/chat/completions --model $OBOL_LLM_MODEL --count 5\"
+            }],
+            \"max_tokens\": 4000,
+            \"stream\": false
+        }" 2>&1 || true  # || true: a curl failure (HTTP error via -f, or the 300s --max-time) still yields status 0, so the caller only sees the output
+}
+
+_agent_buy_pr_exists() {  # status 0 only when kubectl prints a name for the alice-obol PurchaseRequest in $BOB_AGENT_NS; `bob` is defined outside this hunk
+    bob kubectl get purchaserequests.obol.org -n "$BOB_AGENT_NS" alice-obol \
+        -o name 2>/dev/null | grep -q .  # grep -q . succeeds iff the output has at least one non-empty line; kubectl's stderr is discarded
+}
+
+# 1-retry wrapper for the agent buy prompt at flow-13/14 step 46. qwen36-fast
+# (4B-class) occasionally narrates a fabricated failure on the long single-shot
+# buy prompt instead of actually invoking the bash tool. When that happens, no
+# PurchaseRequest is created and step 47 fails with "PurchaseRequest CR not
+# ready" — even though buy.py was never invoked. See
+# plans/inference-v1337-followup-20260514.md.
+#
+# Strategy: poll for the PR for up to 60s after the first prompt; if absent,
+# print a LOUD warning flagging this as agent unreliability and re-send the
+# prompt once. If still absent after the retry, step 47 fails as before.
+agent_buy_with_retry() {  # takes no arguments; exits the script with status 1 if the agent refuses, otherwise ends by calling pass
+    local response content retried=0 i  # retried (0 or 1) is reported in the final pass message
+
+    response=$(_agent_buy_send_prompt)
+    content=$(extract_assistant_content "$response" 2>/dev/null || true)  # extraction errors are silenced: stderr dropped, failure status ignored
+    echo "${content:0:500}"  # show at most the first 500 characters of the agent's reply
+    if [ -z "$(printf '%s' "$content" | tr -d '[:space:]')" ]; then  # true when the reply is empty or whitespace-only
+        echo "  ! Agent returned no final assistant text; confirming purchase via PurchaseRequest CR"
+    fi
+    if printf '%s' "$content" | agent_response_refused; then  # helper defined outside this hunk; receives the reply on stdin
+        fail "Agent refused to run buy.py: ${content:0:500}"
+        emit_metrics; exit 1
+    fi
+
+    # Wait up to 60s for the controller to reconcile the PR. Healthy runs see
+    # it within ~5s; the long ceiling absorbs cluster-cold-start jitter.
+    for i in $(seq 1 12); do  # 12 checks, each followed by a 5s sleep when the PR is absent
+        _agent_buy_pr_exists && break  # stop polling as soon as the PR shows up
+        sleep 5
+    done
+
+    if ! _agent_buy_pr_exists; then  # one more check after the loop; re-prompt only when the PR is still missing
+        echo ""
+        echo "  ╔════════════════════════════════════════════════════════════════════════╗"
+        echo "  ║  WARN: agent did NOT create a PurchaseRequest after 60s.               ║"
+        echo "  ║  Documented qwen36-fast (4B) flake — agent narrates a fabricated       ║"
+        echo "  ║  failure instead of invoking buy.py. Re-prompting ONCE.                ║"
+        echo "  ║  If this fires regularly, switch to a more reliable LLM (qwen36-deep   ║"
+        echo "  ║  / qwen36-35b-heretic) or add a non-agent fallback path.               ║"
+        echo "  ║  Ref: plans/inference-v1337-followup-20260514.md                       ║"
+        echo "  ╚════════════════════════════════════════════════════════════════════════╝"
+        echo ""
+        retried=1
+        response=$(_agent_buy_send_prompt)  # second and last attempt; this function does not poll for the PR again afterwards
+        content=$(extract_assistant_content "$response" 2>/dev/null || true)
+        echo "  RETRY response: ${content:0:500}"
+        if printf '%s' "$content" | agent_response_refused; then
+            fail "Agent refused to run buy.py on retry: ${content:0:500}"
+            emit_metrics; exit 1
+        fi
+    fi
+
+    pass "Agent buy prompt issued (retry=$retried; success will be confirmed by PurchaseRequest CR)"  # reached on both paths, whether or not the PR exists yet
+}
+
 extract_assistant_content() {
     DUAL_STACK_RESPONSE="$1" python3 - <<'PY'
 import json