From 26a67ce7277482d1a5614284324d06db9bb0e27d Mon Sep 17 00:00:00 2001 From: Dinko Dermendzhiev Date: Tue, 31 Mar 2026 18:47:53 -0400 Subject: [PATCH 1/4] setup-policy-routes: add sysfs wait timeout and stale lock detection --- bin/setup-policy-routes.sh | 5 +++++ lib/lib.sh | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/bin/setup-policy-routes.sh b/bin/setup-policy-routes.sh index 7c51089..747c8cb 100755 --- a/bin/setup-policy-routes.sh +++ b/bin/setup-policy-routes.sh @@ -50,12 +50,17 @@ refresh) start) register_networkd_reloader counter=0 + max_wait=3000 # 5 minute timeout to avoid infinite loop if sysfs node never appears while [ ! -e "/sys/class/net/${iface}" ]; do if ((counter % 1000 == 0)); then debug "Waiting for sysfs node to exist for ${iface} (iteration $counter)" fi sleep 0.1 ((counter++)) + if ((counter >= max_wait)); then + error "Timed out waiting for sysfs node for ${iface} after $((counter / 10)) seconds" + exit 1 + fi done debug "Starting configuration for $iface" debug /lib/systemd/systemd-networkd-wait-online -i "$iface" diff --git a/lib/lib.sh b/lib/lib.sh index efc2ef3..969e87f 100644 --- a/lib/lib.sh +++ b/lib/lib.sh @@ -631,6 +631,18 @@ register_networkd_reloader() { local -r lockfile="${lockdir}/${iface}" local old_opts=$- + # If the existing lock owner is no longer alive, remove the stale lockfile + # so subsequent invocations don't spin for up to 1000 seconds waiting on a + # process that will never release it. + if [ -f "${lockfile}" ]; then + local existing_pid + existing_pid=$(cat "${lockfile}" 2>/dev/null) + if [ -n "$existing_pid" ] && ! 
kill -0 "$existing_pid" 2>/dev/null; then + debug "Removing stale lock from dead process $existing_pid for ${iface}" + rm -f "${lockfile}" + fi + fi + # Disable -o errexit in the following block so we can capture # nonzero exit codes from a redirect without considering them # fatal errors From 9fad70be54220fa778b5ed79a929e8eb29289334 Mon Sep 17 00:00:00 2001 From: Dinko Dermendzhiev Date: Thu, 2 Apr 2026 13:49:37 -0400 Subject: [PATCH 2/4] falsy ((counter++)) needs || true when counter = 0 --- bin/setup-policy-routes.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/setup-policy-routes.sh b/bin/setup-policy-routes.sh index 747c8cb..20d8874 100755 --- a/bin/setup-policy-routes.sh +++ b/bin/setup-policy-routes.sh @@ -56,7 +56,7 @@ start) debug "Waiting for sysfs node to exist for ${iface} (iteration $counter)" fi sleep 0.1 - ((counter++)) + ((counter++)) || true if ((counter >= max_wait)); then error "Timed out waiting for sysfs node for ${iface} after $((counter / 10)) seconds" exit 1 From 097d289f1bc2a37b2d60399a30169e430f9f1903 Mon Sep 17 00:00:00 2001 From: Dinko Dermendzhiev Date: Thu, 2 Apr 2026 15:18:14 -0400 Subject: [PATCH 3/4] lib/lib.sh: lower register_networkd_reloader() max (spin time) to match sysfs wait timeout --- lib/lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lib.sh b/lib/lib.sh index 969e87f..b03a665 100644 --- a/lib/lib.sh +++ b/lib/lib.sh @@ -627,7 +627,7 @@ maybe_reload_networkd() { register_networkd_reloader() { local -i registered=1 cnt=0 - local -i max=10000 + local -i max=3000 # 300s (3000 × 0.1s); matches sysfs wait timeout in setup-policy-routes.sh local -r lockfile="${lockdir}/${iface}" local old_opts=$- From 65fbb0324da6faf4125dcaaecadb245e8f8285f3 Mon Sep 17 00:00:00 2001 From: Dinko Dermendzhiev Date: Fri, 3 Apr 2026 10:56:41 -0400 Subject: [PATCH 4/4] setup-policy-routes: disable refresh timer and prevent restart using exit code 2 on timeout (ENI is invalid) --- 
bin/setup-policy-routes.sh | 3 ++- systemd/system/policy-routes@.service | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/setup-policy-routes.sh b/bin/setup-policy-routes.sh index 20d8874..b0e3b95 100755 --- a/bin/setup-policy-routes.sh +++ b/bin/setup-policy-routes.sh @@ -59,7 +59,8 @@ start) ((counter++)) || true if ((counter >= max_wait)); then error "Timed out waiting for sysfs node for ${iface} after $((counter / 10)) seconds" - exit 1 + /usr/bin/systemctl disable --now refresh-policy-routes@${iface}.timer 2>/dev/null || true + exit 2 fi done debug "Starting configuration for $iface" diff --git a/systemd/system/policy-routes@.service b/systemd/system/policy-routes@.service index 675e0e7..5093bbd 100644 --- a/systemd/system/policy-routes@.service +++ b/systemd/system/policy-routes@.service @@ -17,4 +17,5 @@ User=root ExecStart=/usr/bin/setup-policy-routes %i start Restart=on-failure RestartSec=1 +RestartPreventExitStatus=2 KillMode=process