|
| 1 | +#!/bin/sh |
| 2 | +# ba-recovery: detect and recover broken BA sessions after cross-band roaming |
| 3 | +# |
| 4 | +# ROOT CAUSE: |
| 5 | +# QCN9274 firmware (WLAN.WBE.1.4.1) maintains internal BA/TX aggregation |
| 6 | +# state indexed by STA MAC across both 5G and 6G MACs. When the same STA |
| 7 | +# re-appears on a different MAC after a band-hop roam, the firmware does |
| 8 | +# NOT reset this state. The PHY rate stays high but actual throughput |
| 9 | +# collapses to 0-5 Mbps because the firmware's BA window tracking is stale. |
| 10 | +# |
| 11 | +# Kernel patch 994 (force-fresh TID on ampdu_start) handles ~50-70% of |
| 12 | +# cases by resetting the REO queue. This script is the userspace safety net |
| 13 | +# for the remaining cases where firmware still gets stuck. |
| 14 | +# |
| 15 | +# FLOW: |
| 16 | +# 1. Watches hostapd syslog for STA association messages |
| 17 | +# 2. Detects band-hops (STA moves between phy5g-apX and phy6g-apX) |
| 18 | +# 3. Skips STAs that are completely idle (zero packets in the sample) and were never seen passing real traffic (>= PKT_THRESHOLD) |
| 19 | +# 4. After CHECK_DELAY seconds, samples RX packet count twice (1s apart) |
| 20 | +# 5. If delta < PKT_THRESHOLD -> disassociates STA to force clean reconnect |
| 21 | +# 6. Stops kicking after MAX_KICKS consecutive failures per STA |
| 22 | +# |
| 23 | +# Usage: ba-recovery [CHECK_DELAY_SEC] [PKT_THRESHOLD] |
| 24 | +# Defaults: CHECK_DELAY=2, PKT_THRESHOLD=2000 |
| 25 | +# |
| 26 | +# Install as init.d service for persistent background operation. |
| 27 | + |
# Runtime configuration, taken from the positional arguments:
#   $1  CHECK_DELAY    delay before a band-hopped STA is sampled (passed to sleep)
#   $2  PKT_THRESHOLD  minimum RX packets per 1s sample for a STA to count as healthy
# MAX_KICKS caps how many rapid-fire disassociations are issued per STA.

# Succeeds when $1 is a non-empty string made only of decimal digits.
_is_uint() {
	case "$1" in
		''|*[!0-9]*) return 1 ;;
	esac
	return 0
}

CHECK_DELAY="${1:-2}"
PKT_THRESHOLD="${2:-2000}"
MAX_KICKS=5

# PKT_THRESHOLD is used in integer comparisons ([ -lt ] / [ -ge ]); a
# non-numeric value makes those tests error out, so fall back to the default.
if ! _is_uint "$PKT_THRESHOLD"; then
	logger -t ba-recovery "invalid PKT_THRESHOLD '$PKT_THRESHOLD', using default 2000"
	PKT_THRESHOLD=2000
fi
| 31 | + |
# Board gate: the workaround targets the QCN9274 firmware bug on the EAP105,
# so on any other board (or when the board name cannot be read) log and
# exit successfully without doing anything.
BOARD=$(cat /tmp/sysinfo/board_name 2>/dev/null)
if [ "$BOARD" != "edgecore,eap105" ]; then
	logger -t ba-recovery "board '$BOARD' is not EAP105, exiting"
	exit 0
fi
| 41 | + |
# Emit a message to syslog (tag "ba-recovery") and to stdout.
# Arguments: all arguments are joined into one message.
# Outputs:   "[<epoch seconds>] <message>" on stdout.
log() {
	logger -t ba-recovery "$*"
	# printf '%s' prints the message verbatim; echo may interpret backslashes
	# or a leading option-like word depending on the shell.
	printf '[%s] %s\n' "$(date +%s)" "$*"
}
| 46 | + |
# Print the RX packet counter of one station as reported by `iwinfo assoc`.
# Arguments: $1 - station MAC (any case), $2 - interface name
# Outputs:   the packet count on stdout, or "0" when the station (or its
#            RX line) is not found or an argument is empty.
get_rx_pkts() {
	local mac="$1"
	local iface="$2"
	local mac_upper pkts

	# An empty MAC would otherwise match the first station in the table.
	if [ -z "$mac" ] || [ -z "$iface" ]; then
		echo "0"
		return 0
	fi

	mac_upper=$(printf '%s' "$mac" | tr 'a-f' 'A-F')

	# Match the MAC exactly against the first field of a line, then look for
	# the "RX:" line within the next two lines and take the number that
	# precedes " Pkts.".
	pkts=$(iwinfo "$iface" assoc 2>/dev/null | awk -v want="$mac_upper" '
		toupper($1) == want { window = 2; next }
		window > 0 {
			window--
			if (index($0, "RX:") && match($0, /[0-9]+ Pkts\./)) {
				# " Pkts." is 6 characters long
				print substr($0, RSTART, RLENGTH - 6)
				exit
			}
		}
	')

	[ -z "$pkts" ] && pkts="0"
	echo "$pkts"
}
| 63 | + |
# Disassociate a stuck station, rate-limited by a per-STA kick counter.
# Globals:   STATE_DIR (read; holds <mac>.kicks and <mac>.last_kick),
#            MAX_KICKS, PKT_THRESHOLD (read)
# Arguments: $1 - station MAC, $2 - interface, $3 - measured packet delta
#            (only used in the log message)
# Returns:   0 when the kick was skipped; otherwise the exit status of
#            hostapd_cli.
do_recover() {
	local mac="$1"
	local iface="$2"
	local delta="$3"
	local kicks last_kick now elapsed rc

	# Load the kick counter; treat a missing, empty or corrupt file as 0 so
	# the arithmetic below cannot fail.
	kicks=0
	[ -f "$STATE_DIR/${mac}.kicks" ] && kicks=$(cat "$STATE_DIR/${mac}.kicks")
	case "$kicks" in
		''|*[!0-9]*) kicks=0 ;;
	esac

	# Reset counter if last kick was more than 30 seconds ago (not a rapid-fire loop)
	now=$(date +%s)
	last_kick=0
	[ -f "$STATE_DIR/${mac}.last_kick" ] && last_kick=$(cat "$STATE_DIR/${mac}.last_kick")
	case "$last_kick" in
		''|*[!0-9]*) last_kick=0 ;;
	esac
	elapsed=$((now - last_kick))
	if [ "$elapsed" -gt 30 ]; then
		kicks=0
	fi

	# The counter and timestamp are recorded even when the kick is skipped
	# below, so repeated attempts keep the back-off window open.
	kicks=$((kicks + 1))
	echo "$kicks" > "$STATE_DIR/${mac}.kicks"
	echo "$now" > "$STATE_DIR/${mac}.last_kick"

	if [ "$kicks" -gt "$MAX_KICKS" ]; then
		log "SKIP: $mac rapid-fire kicks=$kicks > max=$MAX_KICKS within 30s, backing off"
		return 0
	fi

	log "RECOVER: $mac on $iface pkt_delta=${delta} < threshold=${PKT_THRESHOLD} (kick $kicks/$MAX_KICKS)"
	log "RECOVER: disassociating $mac from $iface"
	# NOTE(review): tx=0 presumably suppresses the over-the-air disassoc
	# frame — confirm against the hostapd_cli documentation.
	hostapd_cli -i "$iface" disassociate "$mac" reason=4 tx=0
	rc=$?
	if [ "$rc" -ne 0 ]; then
		log "ERROR: hostapd_cli disassociate failed for $mac on $iface (rc=$rc)"
	fi
	return "$rc"
}
| 96 | + |
# Sample a station's RX packet counter twice, one second apart, and trigger
# recovery when the rate is below PKT_THRESHOLD.
# Globals:   STATE_DIR (read/write: <mac>.active, <mac>.kicks), PKT_THRESHOLD
# Arguments: $1 - station MAC, $2 - interface
check_sta_pkts() {
	local mac="$1"
	local iface="$2"
	local pkts1 pkts2 delta

	# get_rx_pkts prints "0" when it cannot find the station.
	pkts1=$(get_rx_pkts "$mac" "$iface")
	if [ "$pkts1" = "0" ]; then
		log "check: $mac on $iface — STA not found in assoc table"
		return 0
	fi

	sleep 1

	pkts2=$(get_rx_pkts "$mac" "$iface")
	if [ "$pkts2" = "0" ]; then
		log "check: $mac on $iface — STA gone after 1s"
		return 0
	fi

	delta=$((pkts2 - pkts1))

	# A counter that went backwards means it was reset between the two
	# samples; the delta is meaningless, so do not kick on it.
	if [ "$delta" -lt 0 ]; then
		log "SKIP: $mac on $iface — RX counter went backwards ($pkts1 -> $pkts2), sample invalid"
		return 0
	fi

	# Skip truly idle STAs (no traffic at all) — they aren't broken, just inactive
	if [ "$delta" -eq 0 ] && [ ! -f "$STATE_DIR/${mac}.active" ]; then
		log "SKIP: $mac on $iface — idle STA (delta=0, never active), ignoring"
		return 0
	fi

	if [ "$delta" -lt "$PKT_THRESHOLD" ]; then
		do_recover "$mac" "$iface" "$delta"
	else
		log "OK: $mac on $iface pkt_delta=${delta}/s (healthy)"
		# Reset kick counter on success
		rm -f "$STATE_DIR/${mac}.kicks"
		# Mark as active for future reference
		echo "$delta" > "$STATE_DIR/${mac}.active"
	fi
}
| 134 | + |
# Record that a station is passing real traffic.
# Takes two RX packet readings one second apart and, when the difference
# reaches PKT_THRESHOLD, stores it in $STATE_DIR/<mac>.active.
# Arguments: $1 - station MAC, $2 - interface
update_active_state() {
	local mac="$1"
	local iface="$2"
	local first second rate

	first=$(get_rx_pkts "$mac" "$iface")
	# "0" is what get_rx_pkts prints when it has no reading for the station
	if [ "$first" = "0" ]; then
		return
	fi

	sleep 1

	second=$(get_rx_pkts "$mac" "$iface")
	if [ "$second" = "0" ]; then
		return
	fi

	rate=$((second - first))
	if ! [ "$rate" -ge "$PKT_THRESHOLD" ]; then
		return 0
	fi
	echo "$rate" > "$STATE_DIR/${mac}.active"
}
| 153 | + |
# --- Main ---
log "=== ba-recovery daemon starting (pid=$$) ==="
log "CONFIG: CHECK_DELAY=${CHECK_DELAY}s, PKT_THRESHOLD=${PKT_THRESHOLD}pkts/s, MAX_KICKS=${MAX_KICKS}"

# Interface discovery: poll the ubus object list for hostapd.<iface> entries,
# up to 30 rounds with a 2s pause between rounds, and stop early once both a
# 5G and a 6G interface name have been seen.
IFACE_5G=""
IFACE_6G=""
RETRIES=30
until [ "$RETRIES" -le 0 ]; do
	# sed keeps only the hostapd.* objects and strips the "hostapd." prefix
	for iface in $(ubus list 2>/dev/null | sed -n 's/^hostapd\.//p'); do
		case "$iface" in
			*5g*) IFACE_5G="$iface" ;;
			*6g*) IFACE_6G="$iface" ;;
		esac
	done
	if [ -n "$IFACE_5G" ] && [ -n "$IFACE_6G" ]; then
		break
	fi
	RETRIES=$((RETRIES - 1))
	sleep 2
done

# Finding only one band is tolerated; finding none at all is fatal.
if [ -z "${IFACE_5G}${IFACE_6G}" ]; then
	log "ERROR: no hostapd interfaces found after 60s"
	exit 1
fi
if [ -n "$IFACE_5G" ]; then
	log " 5G interface: $IFACE_5G"
fi
if [ -n "$IFACE_6G" ]; then
	log " 6G interface: $IFACE_6G"
fi
| 181 | + |
# State tracking
# Per-STA state lives in files under STATE_DIR; the directory is recreated
# empty on every start.
STATE_DIR="/tmp/ba-recovery-state"
rm -rf "$STATE_DIR"
if ! mkdir -p "$STATE_DIR"; then
	logger -t ba-recovery "ERROR: cannot create state dir $STATE_DIR"
	exit 1
fi

# Signal handler: stop background jobs, remove the state directory, exit 0.
cleanup() {
	local pids

	# Stop the children first so none of them writes into a directory that
	# has already been removed; skip kill entirely when there are no jobs.
	pids=$(jobs -p)
	# shellcheck disable=SC2086 — word splitting is wanted, one PID per word
	[ -n "$pids" ] && kill $pids 2>/dev/null
	rm -rf "$STATE_DIR"
	exit 0
}
trap cleanup INT TERM
| 193 | + |
log "=== ready, watching logread for hostapd events ==="

# Watch syslog for hostapd "associated" messages
# Format: "hostapd: phy5g-ap0: STA xx:xx:xx:xx:xx:xx IEEE 802.11: associated (aid N)"
#
# The pipeline is started as a background job and the main shell blocks in
# `wait`. A shell defers trapped signals while a foreground command runs, and
# `logread -f` never finishes, so with a foreground pipeline the INT/TERM
# handler would never get to run. `wait` is interrupted by a trapped signal,
# and a background pipeline is also visible to `jobs -p` in the handler.
logread -f | while IFS= read -r line; do
	case "$line" in
		*"IEEE 802.11: associated (aid"*)
			# Extract MAC and interface from hostapd log
			mac=$(echo "$line" | sed -n 's/.*STA \([0-9a-fA-F:]*\) IEEE.*/\1/p' | tr 'A-F' 'a-f')
			iface=$(echo "$line" | sed -n 's/.*hostapd: \([^ :]*\): STA.*/\1/p')

			[ -z "$mac" ] && continue
			[ -z "$iface" ] && continue

			# Determine band from interface name
			band=""
			case "$iface" in
				*5g*) band="5g" ;;
				*6g*) band="6g" ;;
				*) continue ;;
			esac

			# Check for band hop: $STATE_DIR/<mac> holds the band of the
			# previous association of this STA, if any.
			prev_band=""
			[ -f "$STATE_DIR/$mac" ] && prev_band=$(cat "$STATE_DIR/$mac")
			echo "$band" > "$STATE_DIR/$mac"

			if [ -n "$prev_band" ] && [ "$prev_band" != "$band" ]; then
				log "BAND-HOP: $mac moved $prev_band -> $band ($iface)"
				# Checked in the background so the log reader is not blocked
				(
					sleep "$CHECK_DELAY"
					check_sta_pkts "$mac" "$iface"
				) &
			else
				# Same band or first association — sample traffic to track active state
				# Do this in background to not block the log reader
				(
					sleep 1
					update_active_state "$mac" "$iface"
				) &
			fi
			;;
	esac
done &
WATCHER_PID=$!

# Block here; a trapped INT/TERM interrupts wait and runs the handler, which
# exits the script.
wait "$WATCHER_PID"

# Only reached when the watcher pipeline ended on its own (for example
# logread exited): report it, drop the state and exit non-zero.
log "ERROR: logread watcher exited unexpectedly, shutting down"
rm -rf "$STATE_DIR"
exit 1
0 commit comments