|
| 1 | +#!/usr/bin/env bash |
| 2 | +# |
| 3 | +# repro_issue_185.sh — Reproduce the TCP-reset bug behind issue #185 / PR #389. |
| 4 | +# |
| 5 | +# Symptom: `hey`/clients see "connection reset by peer" (and pooled keepalive |
| 6 | +# clients log `Unsolicited response received on idle HTTP channel starting with |
| 7 | +# "HTTP/1.1 ..."`) when SLEdge answers a request *before* it has finished reading |
| 8 | +# the request body. |
| 9 | +# |
| 10 | +# Why a plain bodyless GET does NOT reproduce it: |
| 11 | +# 429/500/503 are emitted from on_client_request_received(), which only runs |
| 12 | +# after the FULL request (body included) has been read. So those are clean. |
| 13 | +# |
| 14 | +# The reproducible path is 404. In on_client_request_receiving() |
| 15 | +# (runtime/src/listener_thread.c, ~line 200) the route is matched the moment the |
| 16 | +# URL is parsed from the *request line* — before the body arrives: |
| 17 | +# |
| 18 | +# if (session->route == NULL && strlen(session->http_request.full_url) > 0) { |
| 19 | +# route = http_router_match_route(...); |
| 20 | +# if (route == NULL) { ...404...; on_client_response_header_sending(); return; } |
| 21 | +# } |
| 22 | +# |
| 23 | +# So: POST a sizeable body to a NON-existent route. SLEdge writes the 404 and |
| 24 | +# close()es the socket while the client is still sending. On Linux, close() with |
| 25 | +# unread data in the kernel receive buffer discards it and emits a RST instead of |
| 26 | +# a graceful FIN -> "connection reset by peer". |
| 27 | +# |
| 28 | +# Expected result: |
| 29 | +# * On master / fix/docker-dev-setup (unpatched): a nonzero number of |
| 30 | +# "connection reset by peer" errors -> BUG REPRODUCED. |
| 31 | +# * On fix/issue-185-graceful-close (patched tcp_session_close): 0 resets. |
| 32 | +# |
| 33 | +# Usage: |
| 34 | +# ./repro_issue_185.sh |
| 35 | +# |
| 36 | +# Tunables (env vars): |
| 37 | +# PORT=10000 tenant listen port |
| 38 | +# REQUESTS=2400 total requests (hey -n) |
| 39 | +# CONCURRENCY=32 concurrent connections (hey -c) |
| 40 | +# BODY_BYTES=100000 request body size (~100 KB); larger = more resets |
| 41 | +# |
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Tunables: a non-empty value from the environment wins; otherwise fall back to
# the defaults listed in the header.
: "${PORT:=10000}"
: "${REQUESTS:=2400}"
: "${CONCURRENCY:=32}"
: "${BODY_BYTES:=100000}"
| 48 | + |
# --- locate the repo (this script lives at the repo root) ---------------------
# Resolve the directory holding this script to a physical (symlink-free) path,
# then derive the runtime binary and sample-module locations from it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
REPO_ROOT="$(cd "$script_dir" && pwd -P)"
BIN_DIR="${REPO_ROOT}/runtime/bin"
SLEDGERT="${BIN_DIR}/sledgert"
WASM="${BIN_DIR}/empty.wasm.so"
| 54 | + |
# Print one line wrapped in an ANSI SGR colour sequence, then reset.
#   $1 - SGR parameters (e.g. "1;31"); remaining args - the text to print.
_paint() {
  local sgr=$1
  shift
  printf '\033[%sm%s\033[0m\n' "$sgr" "$*"
}

# Bold red line: errors and the "bug reproduced" verdict.
red() { _paint '1;31' "$*"; }

# Green line: the "no resets" verdict.
green() { _paint '0;32' "$*"; }

# Cyan progress line, prefixed with "==> ".
info() { _paint '0;36' "==> $*"; }
| 58 | + |
# --- prerequisites ------------------------------------------------------------
# Fail fast, with a hint on how to fix it, when a required tool or build
# artifact is missing.
if ! command -v hey >/dev/null 2>&1; then
  red "ERROR: 'hey' is not installed. Install it with:"
  echo "  go install github.com/rakyll/hey@latest   (then add \$(go env GOPATH)/bin to PATH)"
  echo "  or: apt-get install -y hey"
  exit 1
fi

# curl drives the sanity checks that run before the load. Without this check a
# missing curl would surface later as a misleading "sledgert is not serving the
# valid route" failure.
if ! command -v curl >/dev/null 2>&1; then
  red "ERROR: 'curl' is not installed; the pre-load sanity checks need it."
  exit 1
fi

if [[ ! -x "$SLEDGERT" ]]; then
  red "ERROR: $SLEDGERT not found. Build the runtime first, e.g.:"
  echo "  make runtime     # builds runtime/bin/sledgert"
  echo "  make install     # full build incl. wasm apps"
  exit 1
fi

if [[ ! -f "$WASM" ]]; then
  red "ERROR: $WASM not found. Build the sample apps first, e.g.:"
  echo "  make install     # builds the wasm apps incl. empty.wasm.so"
  exit 1
fi
| 79 | + |
# Warn (don't block) if running on a branch that already contains the fix.
# The fix is detected by looking for a shutdown(client_socket...) call in
# tcp_session.h; an unreadable or missing header simply means no note.
tcp_session_header="$REPO_ROOT/runtime/include/tcp_session.h"
if grep -q 'shutdown(client_socket' "$tcp_session_header" 2>/dev/null; then
  patched_note=(
    "NOTE: tcp_session.h contains the graceful-close fix — you are on a PATCHED"
    "      checkout, so you should see 0 resets (the fix working). To see the BUG,"
    "      check out an unpatched branch (e.g. master) and 'make runtime' first."
  )
  for note_line in "${patched_note[@]}"; do
    info "$note_line"
  done
fi
| 86 | + |
# --- write a minimal tenant spec ----------------------------------------------
# A single tenant ("gwu") listening on $PORT whose only route is /empty, backed
# by empty.wasm.so.
SPEC="$(mktemp /tmp/issue185-spec.XXXXXX.json)"
cat <<TENANT_SPEC >"$SPEC"
[
  {
    "name": "gwu",
    "port": ${PORT},
    "routes": [
      {
        "route": "/empty",
        "path": "empty.wasm.so",
        "admissions-percentile": 70,
        "relative-deadline-us": 50000,
        "http-resp-content-type": "text/plain"
      }
    ]
  }
]
TENANT_SPEC

# --- generate the request body ------------------------------------------------
# The body is BODY_BYTES copies of the letter 'x': take that many NUL bytes
# from /dev/zero and translate each one.
BODY="$(mktemp /tmp/issue185-body.XXXXXX)"
head -c "${BODY_BYTES}" /dev/zero \
  | tr '\0' 'x' >"$BODY"
| 110 | + |
# --- launch sledgert ----------------------------------------------------------
LOG="$(mktemp /tmp/issue185-sledge.XXXXXX.log)"
SLEDGE_PID=""

# cleanup: EXIT handler. Stops sledgert (if it was started) and removes every
# temp file this script may have created.
#   Globals read: SLEDGE_PID, SPEC, BODY, LOG, HEY_OUT (each may be unset/empty)
#   Always returns 0 so it never changes the script's exit path.
cleanup() {
  local tmp_path
  if [[ -n "${SLEDGE_PID:-}" ]]; then
    # Ask politely first, give the runtime up to ~2 s to exit, then force it.
    kill "$SLEDGE_PID" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "$SLEDGE_PID" 2>/dev/null || break
      sleep 0.1
    done
    if kill -0 "$SLEDGE_PID" 2>/dev/null; then
      kill -KILL "$SLEDGE_PID" 2>/dev/null || true
    fi
    # Reap the child so the listen port is released before the script returns.
    wait "$SLEDGE_PID" 2>/dev/null || true
  fi
  # HEY_OUT is included so an interrupted load run does not leave its report
  # behind in /tmp.
  for tmp_path in "${SPEC:-}" "${BODY:-}" "${LOG:-}" "${HEY_OUT:-}"; do
    if [[ -n "$tmp_path" ]]; then
      rm -f -- "$tmp_path"
    fi
  done
  return 0
}
trap cleanup EXIT
| 120 | + |
| 121 | +info "Starting sledgert on port $PORT (route /empty -> empty.wasm.so)" |
| 122 | +# sledgert resolves the relative "empty.wasm.so" path against its CWD, and needs |
| 123 | +# runtime/bin on LD_LIBRARY_PATH for libsledge/libck. 'exec' makes sledgert |
| 124 | +# replace the subshell so $! is sledgert's own PID (so cleanup kills it). |
| 125 | +( cd "$BIN_DIR" && exec env LD_LIBRARY_PATH="$BIN_DIR:${LD_LIBRARY_PATH:-}" "$SLEDGERT" "$SPEC" ) >"$LOG" 2>&1 & |
| 126 | +SLEDGE_PID=$! |
| 127 | + |
| 128 | +# --- wait for the tenant port to accept connections --------------------------- |
| 129 | +info "Waiting for port $PORT to come up..." |
| 130 | +for _ in $(seq 1 50); do |
| 131 | + if ! kill -0 "$SLEDGE_PID" 2>/dev/null; then |
| 132 | + red "sledgert exited during startup. Log:" |
| 133 | + cat "$LOG" |
| 134 | + exit 1 |
| 135 | + fi |
| 136 | + if (exec 3<>"/dev/tcp/127.0.0.1/$PORT") 2>/dev/null; then |
| 137 | + exec 3>&- 3<&- 2>/dev/null || true |
| 138 | + break |
| 139 | + fi |
| 140 | + sleep 0.2 |
| 141 | +done |
| 142 | + |
# Sanity checks: valid route -> 200, missing route (bodyless) -> 404 (clean).
# Each probe is capped at 5 s so a wedged runtime cannot hang the script. When
# curl fails, -w still prints a status (000), and '|| true' keeps set -e from
# aborting before the result is reported.
curl_probe=(curl -s -o /dev/null -w '%{http_code}' --max-time 5)
ok="$("${curl_probe[@]}" -X POST --data hi "http://127.0.0.1:$PORT/empty" || true)"
nf="$("${curl_probe[@]}" "http://127.0.0.1:$PORT/nope" || true)"
info "Sanity: POST /empty -> $ok , GET /nope (bodyless) -> $nf"
if [[ "$ok" != "200" ]]; then
  red "sledgert is not serving the valid route as expected. Log:"
  cat "$LOG"
  exit 1
fi
# The repro depends on /nope being answered with a 404. Warn, but carry on, if
# that is not what came back.
if [[ "$nf" != "404" ]]; then
  red "WARNING: GET /nope returned '$nf' (expected 404); the early-404 path may not be exercised."
fi
| 152 | + |
# --- the load that triggers the bug -------------------------------------------

# count_hey_errors: total the errors in a hey report whose message contains a
# given fixed string.
#   $1 - literal text to look for
#   $2 - path to the captured hey output
# hey's error distribution prefixes each distinct message with "[N]". Those
# counts are summed so repeated messages are not undercounted; a matching line
# with no "[N]" prefix counts as one.
# Prints the total on stdout; status is always 0. Prints 0 or nothing when the
# report is unreadable.
count_hey_errors() {
  local needle=$1 report=$2
  awk -v needle="$needle" '
    index($0, needle) {
      if (match($0, /^[ \t]*\[[0-9]+\]/)) {
        prefix = substr($0, RSTART, RLENGTH)
        gsub(/[^0-9]/, "", prefix)
        total += prefix
      } else {
        total += 1
      }
    }
    END { print total + 0 }
  ' "$report" 2>/dev/null || true
}

info "Firing $REQUESTS POSTs ($BODY_BYTES-byte body, concurrency $CONCURRENCY) at /nope (non-existent route)"
HEY_OUT="$(mktemp /tmp/issue185-hey.XXXXXX)"
# hey's own exit status is not interesting here; the captured report is.
hey -n "$REQUESTS" -c "$CONCURRENCY" -m POST -D "$BODY" "http://127.0.0.1:$PORT/nope" >"$HEY_OUT" 2>&1 || true

echo
echo "----- hey status code distribution -----"
sed -n '/Status code distribution/,/^$/p' "$HEY_OUT" || true

RESETS="$(count_hey_errors 'connection reset by peer' "$HEY_OUT")"
EPIPES="$(count_hey_errors 'broken pipe' "$HEY_OUT")"
rm -f "$HEY_OUT"

echo
echo "============================================================"
echo "  'connection reset by peer' errors : ${RESETS:-0}"
echo "  'broken pipe' (EPIPE) errors      : ${EPIPES:-0}   (known large-body limitation)"
echo "============================================================"
if [[ "${RESETS:-0}" -gt 0 ]]; then
  red "BUG REPRODUCED: SLEdge sent RSTs on early 404 responses (issue #185)."
  echo "On fix/issue-185-graceful-close this count drops to 0."
else
  green "No resets observed. If you are on the patched branch, this is the FIX working."
  echo "If you expected the bug: confirm you 'make runtime' on an unpatched branch,"
  echo "and try a larger BODY_BYTES (e.g. BODY_BYTES=1000000) or higher CONCURRENCY."
fi
0 commit comments