Skip to content

Commit c0427c6

Browse files
authored
Merge pull request #21550 from donaldsharp/startup_after_crash_issues
Startup after crash issues
2 parents 340906b + 42aa9f9 commit c0427c6

12 files changed

Lines changed: 482 additions & 21 deletions

File tree

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
!
2+
hostname r1
3+
!
4+
! Intentionally bring down nexthop group keep time
5+
! so that the test can finish in a reasonable amount of time
6+
zebra nexthop-group keep 5
7+
!
8+
interface r1-eth0
9+
ip address 192.168.1.1/24
10+
!
11+
interface r1-eth1
12+
ip address 192.168.2.1/24
13+
!
14+
interface r1-eth2
15+
ip address 192.168.3.1/24
16+
!
17+
nexthop-group twonhg
18+
nexthop 192.168.1.2 r1-eth0
19+
nexthop 192.168.2.2 r1-eth1
20+
!
21+
nexthop-group threenhg
22+
nexthop 192.168.1.2 r1-eth0
23+
nexthop 192.168.2.2 r1-eth1
24+
nexthop 192.168.3.2 r1-eth2
25+
!
26+
ip route 10.3.0.0/24 192.168.1.12
27+
ip route 10.3.0.0/24 192.168.2.22
28+
!
Lines changed: 357 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,357 @@
1+
#!/usr/bin/env python
2+
# SPDX-License-Identifier: ISC
3+
#
4+
# test_zebra_gr.py
5+
#
6+
# Copyright (c) 2026 by Nvidia Inc.
7+
# Donald Sharp
8+
#
9+
# Test that zebra properly reads kernel state on restart with -K and
10+
# sweeps stale routes after the graceful restart timer expires.
11+
#
12+
13+
"""
14+
test_zebra_gr.py: Test zebra graceful restart kernel route/NHG read-in and sweep.
15+
16+
Steps:
17+
1. Start zebra + sharpd + staticd.
18+
2. Have sharpd install routes with singleton nexthops and nexthop groups.
19+
3. Staticd installs a 2-way ECMP route via frr.conf.
20+
4. Verify routes/NHGs are present.
21+
5. Kill zebra, sharpd and staticd (SIGKILL), leaving kernel state in place.
22+
6. Restart zebra with -K40, together with sharpd and staticd.
23+
7. Verify kernel routes and NHGs are read back into zebra.
24+
8. Wait for the 40-second sweep timer to expire.
25+
9. Verify the stale routes and NHGs are cleaned up.
26+
10. Verify the static ECMP route survives the sweep (staticd reclaims it).
27+
"""
28+
29+
import os
30+
import sys
31+
import json
32+
from functools import partial
33+
34+
import pytest
35+
36+
CWD = os.path.dirname(os.path.realpath(__file__))
37+
sys.path.append(os.path.join(CWD, "../"))
38+
39+
from lib import topotest
40+
from lib.common_config import kill_router_daemons, start_router_daemons, step
41+
from lib.topogen import Topogen, TopoRouter, get_topogen
42+
from lib.topolog import logger
43+
44+
pytestmark = [pytest.mark.sharpd, pytest.mark.staticd]
45+
46+
GR_SWEEP_TIME = 40
47+
48+
49+
def setup_module(mod):
    """Create the test topology, load each router's config and start FRR.

    The topology attaches router r1 to three switches (s1, s2 and s3).
    Every router loads ``<router name>/frr.conf`` from this test's
    directory and is asked to run sharpd and staticd as extra daemons.
    """
    tgen = Topogen({"s1": ("r1",), "s2": ("r1",), "s3": ("r1",)}, mod.__name__)
    tgen.start_topology()

    for rname, router in tgen.routers().items():
        conf_path = os.path.join(CWD, "{}/frr.conf".format(rname))
        # A fresh list per router, exactly as the configuration is per router.
        daemons = [
            (TopoRouter.RD_SHARP, ""),
            (TopoRouter.RD_STATIC, ""),
        ]
        router.load_frr_config(conf_path, extra_daemons=daemons)

    tgen.start_router()
69+
70+
71+
def teardown_module():
    """Stop the topology returned by get_topogen()."""
    get_topogen().stop_topology()
74+
75+
76+
def check_sharp_routes(r1, expected_count):
    """Compare the number of sharp routes in the RIB against expected_count.

    Runs "show ip route summary json" on r1 and looks at the "rib" counter
    of every entry in "routes" whose "type" is "sharp".

    A summary that contains no "sharp" entry at all is treated as zero
    sharp routes, so the helper can also be used to confirm that every
    sharp route is gone (expected_count == 0).

    Returns None on match, or a mismatch string that embeds the full
    summary for debugging.
    """
    summary = json.loads(r1.vtysh_cmd("show ip route summary json"))
    sharp_counts = [
        entry.get("rib")
        for entry in summary.get("routes", [])
        if entry.get("type") == "sharp"
    ]

    if expected_count in sharp_counts:
        return None
    # No sharp entry in the summary: nothing of that type is in the RIB.
    if not sharp_counts and expected_count == 0:
        return None

    return "Expected {} sharp routes, got: {}".format(
        expected_count, json.dumps(summary)
    )
85+
86+
87+
def check_kernel_routes_present(r1, prefixes):
    """Check that every prefix has an entry in the RIB reported by r1.

    Each prefix is queried individually with "show ip route <prefix> json";
    the prefix counts as present when it is a key of the returned object.

    Returns None when all prefixes are present, otherwise a string naming
    the first missing prefix.
    """
    command = "show ip route {} json"
    for prefix in prefixes:
        rib_entry = json.loads(r1.vtysh_cmd(command.format(prefix)))
        if prefix in rib_entry:
            continue
        return "prefix {} not found in RIB".format(prefix)
    return None
94+
95+
96+
def check_kernel_routes_absent(r1, prefixes):
    """Check that none of the prefixes has a route entry in the RIB.

    Each prefix is queried individually with "show ip route <prefix> json".
    A prefix counts as absent when it is not a key of the returned object,
    or when its value is empty.

    Returns None when all prefixes are absent, otherwise a string naming
    the first prefix that is still present.
    """
    for prefix in prefixes:
        rib_entry = json.loads(r1.vtysh_cmd("show ip route {} json".format(prefix)))
        if prefix not in rib_entry:
            continue
        if len(rib_entry[prefix]) == 0:
            continue
        return "prefix {} still present in RIB".format(prefix)
    return None
103+
104+
105+
def test_zebra_gr_kernel_read_and_sweep():
    """Test that zebra reads kernel routes on restart and sweeps them after GR timer.

    Phases:
      1. Install 30 sharp routes (singleton, 2-way and 3-way nexthop groups),
         check the static ECMP route, and record the nexthop group IDs.
      2. Kill zebra, sharpd and staticd; the kernel routes must remain.
      3. Restart the daemons with zebra given -K<GR_SWEEP_TIME> and check
         that routes and nexthop groups come back with the same IDs.
      4. Wait for the sweep, then check that the sharp routes and their
         nexthop groups are gone while the static ECMP route is kept.
    """
    tgen = get_topogen()
    if tgen.routers_have_failure():
        pytest.skip(tgen.errors)

    r1 = tgen.gears["r1"]

    # ---- Phase 1: Install routes via sharpd ----

    step("Verify sharpd nexthop groups are installed in zebra RIB")

    def _check_sharp_nhgs_installed():
        output = r1.vtysh_cmd("show nexthop-group rib sharp json", isjson=True)
        if not output or "default" not in output:
            return "No sharp NHG data found"
        vrf = output["default"]
        count = 0
        # Only the per-NHG data is needed here, not the NHG id keys.
        for nhg_data in vrf.values():
            if nhg_data.get("type") == "sharp" and nhg_data.get("installed"):
                count += 1
        if count != 5:
            return "Expected 5 installed sharp NHGs, found {}".format(count)
        return None

    _, result = topotest.run_and_expect(
        _check_sharp_nhgs_installed, None, count=30, wait=1
    )
    assert result is None, result

    step("Install 10 singleton nexthop routes via sharpd")
    r1.vtysh_cmd("sharp install routes 10.0.0.0 nexthop 192.168.1.2 10")

    step("Install 10 routes via nexthop-group twonhg (2 nexthops)")
    r1.vtysh_cmd("sharp install routes 10.1.0.0 nexthop-group twonhg 10")

    step("Install 10 routes via nexthop-group threenhg (3 nexthops)")
    r1.vtysh_cmd("sharp install routes 10.2.0.0 nexthop-group threenhg 10")

    step("Verify 30 sharp routes are installed")
    test_func = partial(check_sharp_routes, r1, 30)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Sharp routes not installed: {}".format(result)

    step("Verify static 2-way ECMP route is installed")

    def _check_static_ecmp():
        output = json.loads(r1.vtysh_cmd("show ip route 10.3.0.0/24 json"))
        route_list = output.get("10.3.0.0/24", [])
        for route in route_list:
            if route.get("protocol") == "static":
                nhs = route.get("nexthops", [])
                if len(nhs) == 2:
                    return None
                return "Static route has {} nexthops, expected 2".format(len(nhs))
        return "Static ECMP route 10.3.0.0/24 not found"

    _, result = topotest.run_and_expect(_check_static_ecmp, None, count=30, wait=1)
    assert result is None, result

    step("Verify all 30 sharp routes plus static route are in the kernel")
    # Destinations are written without a mask so they can be matched as
    # plain substrings of the "ip route show" output.
    sharp_kernel_routes = (
        ["10.0.0.{}".format(i) for i in range(0, 10)]
        + ["10.1.0.{}".format(i) for i in range(0, 10)]
        + ["10.2.0.{}".format(i) for i in range(0, 10)]
    )
    expected_kernel_routes = sharp_kernel_routes + ["10.3.0.0/24"]

    def _check_kernel_routes_installed():
        output = r1.run("ip route show")
        for route in expected_kernel_routes:
            if route not in output:
                return "route {} not found in kernel".format(route)
        return None

    _, result = topotest.run_and_expect(
        _check_kernel_routes_installed, None, count=30, wait=1
    )
    assert result is None, "Kernel routes not installed: {}".format(result)

    step("Record nexthop group IDs and their data before killing zebra")
    route_json = json.loads(r1.vtysh_cmd("show ip route json"))

    singleton_nhg_id = route_json["10.0.0.0/32"][0]["nexthopGroupId"]
    twonhg_nhg_id = route_json["10.1.0.0/32"][0]["nexthopGroupId"]
    threenhg_nhg_id = route_json["10.2.0.0/32"][0]["nexthopGroupId"]
    static_nhg_id = route_json["10.3.0.0/24"][0]["nexthopGroupId"]

    nhg_ids_before = {}
    for name, nhg_id in [
        ("singleton", singleton_nhg_id),
        ("twonhg", twonhg_nhg_id),
        ("threenhg", threenhg_nhg_id),
        ("static", static_nhg_id),
    ]:
        nhg_json = json.loads(
            r1.vtysh_cmd("show nexthop-group rib {} json".format(nhg_id))
        )
        nhg_ids_before[name] = {
            "id": nhg_id,
            "data": nhg_json[str(nhg_id)],
        }

    logger.info(
        "NHG IDs before kill: singleton=%d twonhg=%d threenhg=%d static=%d",
        singleton_nhg_id,
        twonhg_nhg_id,
        threenhg_nhg_id,
        static_nhg_id,
    )

    assert (
        nhg_ids_before["singleton"]["data"]["nexthopCount"] == 1
    ), "Singleton NHG should have 1 nexthop"
    assert (
        nhg_ids_before["twonhg"]["data"]["nexthopCount"] == 2
    ), "twonhg NHG should have 2 nexthops"
    assert (
        nhg_ids_before["threenhg"]["data"]["nexthopCount"] == 3
    ), "threenhg NHG should have 3 nexthops"
    assert (
        nhg_ids_before["static"]["data"]["nexthopCount"] == 2
    ), "static NHG should have 2 nexthops"

    # ---- Phase 2: Kill zebra, sharpd and staticd ----

    step("Kill zebra - kernel routes remain in place")
    kill_router_daemons(tgen, "r1", ["zebra"], save_config=True)
    kill_router_daemons(tgen, "r1", ["sharpd"], save_config=True)
    kill_router_daemons(tgen, "r1", ["staticd"], save_config=True)

    step("Verify routes are still in the kernel after zebra kill")
    output = r1.run("ip route show")
    assert "10.0.0.0" in output, "Singleton routes disappeared from kernel"
    assert "10.1.0.0" in output, "twonhg routes disappeared from kernel"
    assert "10.2.0.0" in output, "threenhg routes disappeared from kernel"
    assert "10.3.0.0/24" in output, "Static ECMP route disappeared from kernel"

    # ---- Phase 3: Restart zebra with -K GR_SWEEP_TIME, plus sharpd and staticd ----

    step("Restart zebra with -K {} (graceful restart)".format(GR_SWEEP_TIME))
    r1.net.daemons_options["zebra"] = "-K{}".format(GR_SWEEP_TIME)
    start_router_daemons(tgen, "r1", ["zebra", "sharpd", "staticd"])

    step("Limit time that nexthop groups are kept around before the sweep happens")
    r1.vtysh_cmd("conf\nzebra nexthop-group keep 5")

    step("Verify kernel routes are read back into zebra RIB")
    singleton_prefixes = ["10.0.0.{}/32".format(i) for i in range(0, 10)]
    twonhg_prefixes = ["10.1.0.{}/32".format(i) for i in range(0, 10)]
    threenhg_prefixes = ["10.2.0.{}/32".format(i) for i in range(0, 10)]
    all_prefixes = singleton_prefixes + twonhg_prefixes + threenhg_prefixes

    test_func = partial(check_kernel_routes_present, r1, all_prefixes)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Routes not read back into zebra: {}".format(result)

    step("Verify all 30 sharp routes are present as self-routes in zebra")
    test_func = partial(check_sharp_routes, r1, 30)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Sharp self-routes not read back: {}".format(result)

    step("Verify static ECMP route is present after restart")
    _, result = topotest.run_and_expect(_check_static_ecmp, None, count=30, wait=1)
    assert result is None, "Static ECMP route not present after restart: {}".format(
        result
    )

    step("Verify nexthop groups are read back with same IDs and data")

    def _check_nhgs_match():
        for name, before in nhg_ids_before.items():
            nhg_id = before["id"]
            nhg_json = json.loads(
                r1.vtysh_cmd("show nexthop-group rib {} json".format(nhg_id))
            )
            nhg_id_str = str(nhg_id)
            if nhg_id_str not in nhg_json:
                return "NHG {} (id {}) not found after restart".format(name, nhg_id)

            after = nhg_json[nhg_id_str]
            expected_count = before["data"]["nexthopCount"]
            actual_count = after.get("nexthopCount", 0)
            if actual_count != expected_count:
                return (
                    "NHG {} (id {}) nexthopCount mismatch: expected {} got {}".format(
                        name, nhg_id, expected_count, actual_count
                    )
                )
        return None

    _, result = topotest.run_and_expect(_check_nhgs_match, None, count=30, wait=1)
    assert result is None, result

    # ---- Phase 4: Wait for sweep and verify cleanup ----
    step("Wait for GR sweep to complete ({} seconds)".format(GR_SWEEP_TIME))

    def _check_sweep_completed():
        output = r1.vtysh_cmd("show zebra")
        if "RIB sweep happened at" in output:
            return None
        return "GR sweep has not completed yet"

    # Poll for longer than the sweep timer itself so a slow start still passes.
    _, result = topotest.run_and_expect(
        _check_sweep_completed, None, count=GR_SWEEP_TIME + 30, wait=1
    )
    assert result is None, result

    step("Verify stale routes have been swept from zebra RIB")
    test_func = partial(check_kernel_routes_absent, r1, all_prefixes)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Stale routes not swept: {}".format(result)

    step("Verify stale sharp routes are removed from kernel")

    def _check_kernel_routes_swept():
        # Polled rather than sampled once: a route leaving the RIB does not
        # by itself show that the kernel delete has already been applied.
        # All 30 sharp destinations are checked, not just the first of
        # each batch.
        output = r1.run("ip route show")
        for route in sharp_kernel_routes:
            if route in output:
                return "route {} still in kernel after sweep".format(route)
        return None

    _, result = topotest.run_and_expect(
        _check_kernel_routes_swept, None, count=30, wait=1
    )
    assert result is None, "Stale sharp routes not removed from kernel: {}".format(
        result
    )

    step("Verify static ECMP route survives the sweep (staticd reclaimed it)")
    _, result = topotest.run_and_expect(_check_static_ecmp, None, count=30, wait=1)
    assert result is None, "Static ECMP route lost after sweep: {}".format(result)

    step("Verify sharp nexthop groups are removed after sweep")

    def _check_sharp_nhgs_removed():
        for name in ("singleton", "twonhg", "threenhg"):
            nhg_id = nhg_ids_before[name]["id"]
            nhg_json = json.loads(
                r1.vtysh_cmd("show nexthop-group rib {} json".format(nhg_id))
            )
            if str(nhg_id) in nhg_json:
                return "NHG {} (id {}) still present after sweep".format(name, nhg_id)
        return None

    _, result = topotest.run_and_expect(
        _check_sharp_nhgs_removed, None, count=15, wait=1
    )
    assert result is None, result
344+
345+
346+
def test_memory_leak():
    """Report memory leaks, or skip when leak reporting is not enabled."""
    tgen = get_topogen()
    if tgen.is_memleak_enabled():
        tgen.report_memory_leaks()
        return

    pytest.skip("Memory leak test/report is disabled")
353+
354+
355+
# Allow running this file directly: hand "-s" plus any command-line
# arguments to pytest and exit with its return code.
if __name__ == "__main__":
    sys.exit(pytest.main(["-s"] + sys.argv[1:]))

zebra/if_netlink.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1483,7 +1483,7 @@ int netlink_link_change(struct nlmsghdr *h, ns_id_t ns_id, int startup)
14831483
dplane_ctx_set_ifp_zif_type(ctx, zif_type);
14841484
dplane_ctx_set_ifindex(ctx, ifi->ifi_index);
14851485
dplane_ctx_set_ifname(ctx, name);
1486-
dplane_ctx_set_ifp_startup(ctx, startup);
1486+
dplane_ctx_set_startup(ctx, startup);
14871487
dplane_ctx_set_ifp_family(ctx, ifi->ifi_family);
14881488
dplane_ctx_set_intf_txqlen(ctx, txqlen);
14891489

0 commit comments

Comments
 (0)