|
| 1 | +#!/usr/bin/env python |
| 2 | +# SPDX-License-Identifier: ISC |
| 3 | +# |
| 4 | +# test_zebra_gr.py |
| 5 | +# |
| 6 | +# Copyright (c) 2026 by Nvidia Inc. |
| 7 | +# Donald Sharp |
| 8 | +# |
| 9 | +# Test that zebra properly reads kernel state on restart with -K and |
| 10 | +# sweeps stale routes after the graceful restart timer expires. |
| 11 | +# |
| 12 | + |
| 13 | +""" |
| 14 | +test_zebra_gr.py: Test zebra graceful restart kernel route/NHG read-in and sweep. |
| 15 | +
|
| 16 | +Steps: |
| 17 | + 1. Start zebra + sharpd + staticd. |
| 18 | + 2. Have sharpd install routes with singleton nexthops and nexthop groups. |
| 19 | + 3. Staticd installs a 2-way ECMP route via frr.conf. |
| 20 | + 4. Verify routes/NHGs are present. |
| 21 | + 5. Kill zebra (SIGKILL), leaving kernel state in place. |
| 22 | + 6. Restart zebra with -K40. |
| 23 | + 7. Verify kernel routes and NHGs are read back into zebra. |
| 24 | + 8. Wait for the 40-second sweep timer to expire. |
| 25 | + 9. Verify the stale routes and NHGs are cleaned up. |
| 26 | + 10. Verify the static ECMP route survives the sweep (staticd reclaims it). |
| 27 | +""" |
| 28 | + |
| 29 | +import os |
| 30 | +import sys |
| 31 | +import json |
| 32 | +from functools import partial |
| 33 | + |
| 34 | +import pytest |
| 35 | + |
| 36 | +CWD = os.path.dirname(os.path.realpath(__file__)) |
| 37 | +sys.path.append(os.path.join(CWD, "../")) |
| 38 | + |
| 39 | +from lib import topotest |
| 40 | +from lib.common_config import kill_router_daemons, start_router_daemons, step |
| 41 | +from lib.topogen import Topogen, TopoRouter, get_topogen |
| 42 | +from lib.topolog import logger |
| 43 | + |
# pytest marks applied to every test in this module: it relies on sharpd and
# staticd being available.
pytestmark = [pytest.mark.sharpd, pytest.mark.staticd]

# Value passed to zebra's -K option when it is restarted (the graceful restart
# sweep timer, in seconds); also used to size the wait for the sweep.
GR_SWEEP_TIME = 40
| 47 | + |
| 48 | + |
def setup_module(mod):
    """Create the single-router topology and start its daemons.

    r1 is attached to three switches (s1, s2, s3). Each router loads its
    ``<name>/frr.conf`` from this test's directory, with sharpd and staticd
    requested as extra daemons.
    """
    tgen = Topogen({"s1": ("r1",), "s2": ("r1",), "s3": ("r1",)}, mod.__name__)
    tgen.start_topology()

    for rname, router in tgen.routers().items():
        config_path = os.path.join(CWD, "{}/frr.conf".format(rname))
        # A fresh list per router, exactly as a literal in the call would give.
        extra_daemons = [(TopoRouter.RD_SHARP, ""), (TopoRouter.RD_STATIC, "")]
        router.load_frr_config(config_path, extra_daemons=extra_daemons)

    tgen.start_router()
| 69 | + |
| 70 | + |
def teardown_module():
    """Stop the topology that setup_module started."""
    get_topogen().stop_topology()
| 74 | + |
| 75 | + |
def check_sharp_routes(r1, expected_count):
    """Compare the sharp RIB count in the route summary with expected_count.

    Runs ``show ip route summary json`` on r1 and looks for an entry whose
    type is "sharp" and whose "rib" value equals expected_count.

    Returns None on a match, otherwise a string describing the mismatch
    (including the full summary JSON) so it can be used with
    topotest.run_and_expect.
    """
    summary = json.loads(r1.vtysh_cmd("show ip route summary json"))
    matched = any(
        row.get("type") == "sharp" and row.get("rib") == expected_count
        for row in summary.get("routes", [])
    )
    if matched:
        return None
    return "Expected {} sharp routes, got: {}".format(
        expected_count, json.dumps(summary)
    )
| 85 | + |
| 86 | + |
def check_kernel_routes_present(r1, prefixes):
    """Verify that every prefix has at least one route entry in zebra's RIB.

    For each prefix, runs ``show ip route <prefix> json`` on r1. Only the
    presence of a non-empty entry for the prefix is checked; the owning
    protocol of the route is not inspected.

    A reply that is missing the prefix key, or that maps it to an empty list,
    counts as "not found". This mirrors check_kernel_routes_absent, which
    treats an empty list as absent, so the two helpers can never both succeed
    for the same reply.

    Returns None when all prefixes are present, otherwise a string naming the
    first missing prefix (suitable for topotest.run_and_expect).
    """
    for pfx in prefixes:
        output = json.loads(r1.vtysh_cmd("show ip route {} json".format(pfx)))
        if not output.get(pfx):
            return "prefix {} not found in RIB".format(pfx)
    return None
| 94 | + |
| 95 | + |
def check_kernel_routes_absent(r1, prefixes):
    """Verify that none of the prefixes has a route entry in zebra's RIB.

    Queries ``show ip route <prefix> json`` on r1 for each prefix in order,
    stopping at the first one that still has a non-empty entry.

    Returns None when every prefix is gone, otherwise a string naming the
    first prefix still present (suitable for topotest.run_and_expect).
    """

    def _still_in_rib(pfx):
        rib = json.loads(r1.vtysh_cmd("show ip route {} json".format(pfx)))
        return pfx in rib and len(rib[pfx]) > 0

    # next() on a generator keeps the early exit: no further vtysh queries
    # are issued once a lingering prefix is found.
    lingering = next((pfx for pfx in prefixes if _still_in_rib(pfx)), None)
    if lingering is None:
        return None
    return "prefix {} still present in RIB".format(lingering)
| 103 | + |
| 104 | + |
def test_zebra_gr_kernel_read_and_sweep():
    """Test that zebra reads kernel routes on restart and sweeps them after GR timer.

    Phases:
      1. Install 30 sharp routes (singleton, 2-way and 3-way nexthop groups),
         confirm the static ECMP route, and record the nexthop group IDs.
      2. Kill zebra, sharpd and staticd; the kernel must keep the routes.
      3. Restart the daemons with zebra running -K GR_SWEEP_TIME and confirm
         routes and nexthop groups are read back with the same IDs.
      4. Wait for the sweep, then confirm the sharp routes and nexthop groups
         are gone from zebra and the kernel while the static route remains.
    """
    tgen = get_topogen()
    if tgen.routers_have_failure():
        pytest.skip(tgen.errors)

    r1 = tgen.gears["r1"]

    # ---- Phase 1: Install routes via sharpd ----

    step("Verify sharpd nexthop groups are installed in zebra RIB")

    def _check_sharp_nhgs_installed():
        output = r1.vtysh_cmd("show nexthop-group rib sharp json", isjson=True)
        if not output or "default" not in output:
            return "No sharp NHG data found"
        count = sum(
            1
            for nhg_data in output["default"].values()
            if nhg_data.get("type") == "sharp" and nhg_data.get("installed")
        )
        if count != 5:
            return "Expected 5 installed sharp NHGs, found {}".format(count)
        return None

    _, result = topotest.run_and_expect(
        _check_sharp_nhgs_installed, None, count=30, wait=1
    )
    assert result is None, result

    step("Install 10 singleton nexthop routes via sharpd")
    r1.vtysh_cmd("sharp install routes 10.0.0.0 nexthop 192.168.1.2 10")

    step("Install 10 routes via nexthop-group twonhg (2 nexthops)")
    r1.vtysh_cmd("sharp install routes 10.1.0.0 nexthop-group twonhg 10")

    step("Install 10 routes via nexthop-group threenhg (3 nexthops)")
    r1.vtysh_cmd("sharp install routes 10.2.0.0 nexthop-group threenhg 10")

    step("Verify 30 sharp routes are installed")
    test_func = partial(check_sharp_routes, r1, 30)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Sharp routes not installed: {}".format(result)

    step("Verify static 2-way ECMP route is installed")

    def _check_static_ecmp():
        output = json.loads(r1.vtysh_cmd("show ip route 10.3.0.0/24 json"))
        route_list = output.get("10.3.0.0/24", [])
        for route in route_list:
            if route.get("protocol") == "static":
                nhs = route.get("nexthops", [])
                if len(nhs) == 2:
                    return None
                return "Static route has {} nexthops, expected 2".format(len(nhs))
        return "Static ECMP route 10.3.0.0/24 not found"

    _, result = topotest.run_and_expect(_check_static_ecmp, None, count=30, wait=1)
    assert result is None, result

    step("Verify all 30 sharp routes plus static route are in the kernel")
    expected_kernel_routes = (
        ["10.0.0.{}".format(i) for i in range(0, 10)]
        + ["10.1.0.{}".format(i) for i in range(0, 10)]
        + ["10.2.0.{}".format(i) for i in range(0, 10)]
        + ["10.3.0.0/24"]
    )

    def _check_kernel_routes_installed():
        # Plain substring match against the `ip route show` text.
        output = r1.run("ip route show")
        for route in expected_kernel_routes:
            if route not in output:
                return "route {} not found in kernel".format(route)
        return None

    _, result = topotest.run_and_expect(
        _check_kernel_routes_installed, None, count=30, wait=1
    )
    assert result is None, "Kernel routes not installed: {}".format(result)

    step("Record nexthop group IDs and their data before killing zebra")
    route_json = json.loads(r1.vtysh_cmd("show ip route json"))

    singleton_nhg_id = route_json["10.0.0.0/32"][0]["nexthopGroupId"]
    twonhg_nhg_id = route_json["10.1.0.0/32"][0]["nexthopGroupId"]
    threenhg_nhg_id = route_json["10.2.0.0/32"][0]["nexthopGroupId"]
    static_nhg_id = route_json["10.3.0.0/24"][0]["nexthopGroupId"]

    # name -> {"id": NHG id, "data": NHG JSON as seen before the restart}
    nhg_ids_before = {}
    for name, nhg_id in [
        ("singleton", singleton_nhg_id),
        ("twonhg", twonhg_nhg_id),
        ("threenhg", threenhg_nhg_id),
        ("static", static_nhg_id),
    ]:
        nhg_json = json.loads(
            r1.vtysh_cmd("show nexthop-group rib {} json".format(nhg_id))
        )
        nhg_ids_before[name] = {
            "id": nhg_id,
            "data": nhg_json[str(nhg_id)],
        }

    logger.info(
        "NHG IDs before kill: singleton=%d twonhg=%d threenhg=%d static=%d",
        singleton_nhg_id,
        twonhg_nhg_id,
        threenhg_nhg_id,
        static_nhg_id,
    )

    assert (
        nhg_ids_before["singleton"]["data"]["nexthopCount"] == 1
    ), "Singleton NHG should have 1 nexthop"
    assert (
        nhg_ids_before["twonhg"]["data"]["nexthopCount"] == 2
    ), "twonhg NHG should have 2 nexthops"
    assert (
        nhg_ids_before["threenhg"]["data"]["nexthopCount"] == 3
    ), "threenhg NHG should have 3 nexthops"
    assert (
        nhg_ids_before["static"]["data"]["nexthopCount"] == 2
    ), "static NHG should have 2 nexthops"

    # ---- Phase 2: Kill zebra, sharpd and staticd ----

    step("Kill zebra - kernel routes remain in place")
    kill_router_daemons(tgen, "r1", ["zebra"], save_config=True)
    kill_router_daemons(tgen, "r1", ["sharpd"], save_config=True)
    kill_router_daemons(tgen, "r1", ["staticd"], save_config=True)

    step("Verify routes are still in the kernel after zebra kill")
    # Check the full expected list, not just the first prefix of each range.
    result = _check_kernel_routes_installed()
    assert result is None, "Routes disappeared from kernel: {}".format(result)

    # ---- Phase 3: Restart zebra with -K GR_SWEEP_TIME (plus sharpd, staticd) ----

    step("Restart zebra with -K {} (graceful restart)".format(GR_SWEEP_TIME))
    r1.net.daemons_options["zebra"] = "-K{}".format(GR_SWEEP_TIME)
    start_router_daemons(tgen, "r1", ["zebra", "sharpd", "staticd"])

    step("Limit time that nexthop groups are kept around before the sweep happens")
    r1.vtysh_cmd("conf\nzebra nexthop-group keep 5")

    step("Verify kernel routes are read back into zebra RIB")
    singleton_prefixes = ["10.0.0.{}/32".format(i) for i in range(0, 10)]
    twonhg_prefixes = ["10.1.0.{}/32".format(i) for i in range(0, 10)]
    threenhg_prefixes = ["10.2.0.{}/32".format(i) for i in range(0, 10)]
    all_prefixes = singleton_prefixes + twonhg_prefixes + threenhg_prefixes

    test_func = partial(check_kernel_routes_present, r1, all_prefixes)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Routes not read back into zebra: {}".format(result)

    step("Verify all 30 sharp routes are present as self-routes in zebra")
    test_func = partial(check_sharp_routes, r1, 30)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Sharp self-routes not read back: {}".format(result)

    step("Verify static ECMP route is present after restart")
    _, result = topotest.run_and_expect(_check_static_ecmp, None, count=30, wait=1)
    assert result is None, "Static ECMP route not present after restart: {}".format(
        result
    )

    step("Verify nexthop groups are read back with same IDs and data")

    def _check_nhgs_match():
        for name, before in nhg_ids_before.items():
            nhg_id = before["id"]
            nhg_json = json.loads(
                r1.vtysh_cmd("show nexthop-group rib {} json".format(nhg_id))
            )
            nhg_id_str = str(nhg_id)
            if nhg_id_str not in nhg_json:
                return "NHG {} (id {}) not found after restart".format(name, nhg_id)

            after = nhg_json[nhg_id_str]
            expected_count = before["data"]["nexthopCount"]
            actual_count = after.get("nexthopCount", 0)
            if actual_count != expected_count:
                return (
                    "NHG {} (id {}) nexthopCount mismatch: expected {} got {}".format(
                        name, nhg_id, expected_count, actual_count
                    )
                )
        return None

    _, result = topotest.run_and_expect(_check_nhgs_match, None, count=30, wait=1)
    assert result is None, result

    # ---- Phase 4: Wait for sweep and verify cleanup ----
    step("Wait for GR sweep to complete ({} seconds)".format(GR_SWEEP_TIME))

    def _check_sweep_completed():
        output = r1.vtysh_cmd("show zebra")
        if "RIB sweep happened at" in output:
            return None
        return "GR sweep has not completed yet"

    # Allow 30 extra one-second polls beyond the sweep timer itself.
    _, result = topotest.run_and_expect(
        _check_sweep_completed, None, count=GR_SWEEP_TIME + 30, wait=1
    )
    assert result is None, result

    step("Verify stale routes have been swept from zebra RIB")
    test_func = partial(check_kernel_routes_absent, r1, all_prefixes)
    _, result = topotest.run_and_expect(test_func, None, count=30, wait=1)
    assert result is None, "Stale routes not swept: {}".format(result)

    step("Verify stale sharp routes are removed from kernel")

    def _check_kernel_routes_removed():
        output = r1.run("ip route show")
        for route, label in (
            ("10.0.0.0", "Singleton"),
            ("10.1.0.0", "twonhg"),
            ("10.2.0.0", "threenhg"),
        ):
            if route in output:
                return "{} routes still in kernel after sweep".format(label)
        return None

    # Poll like every other state check in this test instead of asserting on a
    # single snapshot, in case the kernel delete trails the RIB removal.
    _, result = topotest.run_and_expect(
        _check_kernel_routes_removed, None, count=30, wait=1
    )
    assert result is None, result

    step("Verify static ECMP route survives the sweep (staticd reclaimed it)")
    _, result = topotest.run_and_expect(_check_static_ecmp, None, count=30, wait=1)
    assert result is None, "Static ECMP route lost after sweep: {}".format(result)

    step("Verify sharp nexthop groups are removed after sweep")

    def _check_sharp_nhgs_removed():
        for name in ("singleton", "twonhg", "threenhg"):
            nhg_id = nhg_ids_before[name]["id"]
            nhg_json = json.loads(
                r1.vtysh_cmd("show nexthop-group rib {} json".format(nhg_id))
            )
            if str(nhg_id) in nhg_json:
                return "NHG {} (id {}) still present after sweep".format(name, nhg_id)
        return None

    _, result = topotest.run_and_expect(
        _check_sharp_nhgs_removed, None, count=15, wait=1
    )
    assert result is None, result
| 344 | + |
| 345 | + |
def test_memory_leak():
    """Report memory leaks when memleak checking is enabled; skip otherwise."""
    tgen = get_topogen()
    if tgen.is_memleak_enabled():
        tgen.report_memory_leaks()
        return

    pytest.skip("Memory leak test/report is disabled")
| 353 | + |
| 354 | + |
if __name__ == "__main__":
    # Run under pytest with -s, forwarding any extra command-line arguments.
    sys.exit(pytest.main(["-s", *sys.argv[1:]]))
0 commit comments