|
| 1 | +#!/usr/bin/env python |
| 2 | +# SPDX-License-Identifier: ISC |
| 3 | + |
| 4 | +""" |
| 5 | +Regression test for stale BGP LLGR timers after peer AF deletion. |
| 6 | +
|
| 7 | +The test arms peer->t_llgr_stale[afi][safi], deletes the peer AF that was used |
| 8 | +as the timer callback argument, and verifies that bgpd survives beyond the |
| 9 | +original timer deadline. |
| 10 | +""" |
| 11 | + |
| 12 | +import functools |
| 13 | +import json |
| 14 | +import os |
| 15 | +import sys |
| 16 | +import time |
| 17 | + |
| 18 | +import pytest |
| 19 | + |
| 20 | +CWD = os.path.dirname(os.path.realpath(__file__)) |
| 21 | +sys.path.append(os.path.join(CWD, "../")) |
| 22 | + |
| 23 | +# pylint: disable=C0413 |
| 24 | +from lib import topotest |
| 25 | +from lib.common_config import kill_router_daemons, step |
| 26 | +from lib.topogen import Topogen, get_topogen |
| 27 | + |
pytestmark = [pytest.mark.bgpd]

# Prefix whose BGP path is inspected on r2; presumably advertised by r1 in
# r1/frr.conf -- TODO confirm against the router configuration.
R1_PREFIX = "10.0.0.1/32"
# AS number passed to "router bgp" when reconfiguring r2.
R2_AS = 65002
# Neighbor address queried and deactivated on r2; presumably r1's address on
# the shared switch -- TODO confirm.
R2_NEIGHBOR = "192.168.255.1"
# Seconds used to compute how long bgpd must survive after the peer AF is
# deleted; assumed to match the LLGR stale time configured in frr.conf --
# TODO confirm.
LLGR_STALE_TIME = 10
| 34 | + |
| 35 | + |
def build_topo(tgen):
    """
    Build the test topology: routers r1 and r2 attached to one switch, s1.
    """
    router_names = ["r{}".format(index) for index in (1, 2)]
    for name in router_names:
        tgen.add_router(name)

    # Both routers share a single segment.
    lan = tgen.add_switch("s1")
    for name in router_names:
        lan.add_link(tgen.gears[name])
| 43 | + |
| 44 | + |
def setup_module(mod):
    """
    Create and start the topology, loading each router's frr.conf first.

    The configuration for a router named <rname> is read from
    <rname>/frr.conf next to this test file.
    """
    tgen = Topogen(build_topo, mod.__name__)
    tgen.start_topology()

    routers = tgen.routers()
    for rname in routers:
        config_path = os.path.join(CWD, "{}/frr.conf".format(rname))
        routers[rname].load_frr_config(config_path)

    tgen.start_router()
| 53 | + |
| 54 | + |
def teardown_module(_mod):
    """
    Stop the topology that setup_module started.
    """
    get_topogen().stop_topology()
| 58 | + |
| 59 | + |
| 60 | +def _bgpd_alive(router): |
| 61 | + return ( |
| 62 | + router.cmd("test -d /proc/$(cat /var/run/frr/bgpd.pid) && echo alive || true") |
| 63 | + .strip() |
| 64 | + ) |
| 65 | + |
| 66 | + |
| 67 | +def _neighbor_json(router, neighbor): |
| 68 | + output = router.vtysh_cmd("show ip bgp neighbor {} json".format(neighbor)) |
| 69 | + return json.loads(output).get(neighbor, {}) |
| 70 | + |
| 71 | + |
| 72 | +def _prefix_json(router, prefix): |
| 73 | + output = router.vtysh_cmd("show ip bgp {} json".format(prefix)) |
| 74 | + return json.loads(output) |
| 75 | + |
| 76 | + |
def _route_observation(router, prefix):
    """
    Summarise the first BGP path for a prefix along with bgpd liveness.

    Returns a dict with these keys:
      bgpdAlive            -- result of _bgpd_alive() for the router
      present              -- True when the prefix has at least one path
      stale                -- "stale" field of the first path, or None
      llgrSecondsRemaining -- same-named field of the first path, or None
      community            -- "string" field of the first path's community
                              object, or None
    """
    paths = _prefix_json(router, prefix).get("paths", [])
    head = paths[0] if paths else {}
    community = (head or {}).get("community", {})

    observation = {"bgpdAlive": _bgpd_alive(router), "present": bool(paths)}
    for field in ("stale", "llgrSecondsRemaining"):
        observation[field] = head.get(field)
    observation["community"] = community.get("string")
    return observation
| 90 | + |
| 91 | + |
def test_bgp_llgr_stale_timer_cancelled_on_peer_af_delete():
    """
    Check that bgpd on r2 outlives an LLGR stale timer whose peer AF is gone.

    Failure scenario being guarded against:
      - In GR helper mode, peer->t_llgr_stale[afi][safi] is armed with a
        struct peer_af as its callback argument.
      - `no neighbor ... activate` deletes and frees that peer_af.
      - Unless the timer is cancelled, it later fires and dereferences the
        freed argument.
    """
    tgen = get_topogen()
    if tgen.routers_have_failure():
        pytest.skip(tgen.errors)

    r2 = tgen.gears["r2"]

    def _expect_none(check, attempts):
        # Poll once per second; any non-None value returned by the check is
        # the diagnostic reported if the condition is never met.
        _, outcome = topotest.run_and_expect(
            functools.partial(check), None, count=attempts, wait=1
        )
        assert outcome is None, outcome

    def _r2_bgp_established():
        neighbor = _neighbor_json(r2, R2_NEIGHBOR)
        return None if neighbor.get("bgpState") == "Established" else neighbor

    step("Wait for R2 BGP to establish")
    _expect_none(_r2_bgp_established, 60)

    def _r2_has_prefix():
        obs = _route_observation(r2, R1_PREFIX)
        learned = obs["present"] and not obs["stale"]
        return None if learned else obs

    step("Wait for R2 to learn R1 prefix")
    _expect_none(_r2_has_prefix, 60)

    step("Stop R1 bgpd so R2 arms the LLGR stale timer")
    kill_router_daemons(tgen, "r1", ["bgpd"])

    def _r2_llgr_timer_running():
        obs = _route_observation(r2, R1_PREFIX)
        seconds_left = obs["llgrSecondsRemaining"]
        # Require at least two seconds left so the peer AF can still be
        # deleted before the timer fires.
        armed = (
            obs["present"]
            and obs["stale"] is True
            and obs["community"] == "llgr-stale"
            and seconds_left is not None
            and seconds_left >= 2
        )
        return None if armed else obs

    _expect_none(_r2_llgr_timer_running, 40)

    step("Delete R2 IPv4 peer AF before the LLGR stale timer expires")
    deactivate_peer_af = """
    configure terminal
    router bgp {}
    address-family ipv4 unicast
    no neighbor {} activate
    """.format(
        R2_AS, R2_NEIGHBOR
    )
    r2.vtysh_cmd(deactivate_peer_af)

    # Counted from the deletion rather than from when the timer was armed, so
    # the deadline should fall after the timer's own expiry (assuming
    # LLGR_STALE_TIME matches the configured stale time).
    deadline = time.monotonic() + LLGR_STALE_TIME + 2

    def _r2_bgpd_alive_after_stale_timer_deadline():
        if _bgpd_alive(r2) != "alive":
            return "r2 bgpd is not alive"

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return None
        return "waiting {:.1f}s for stale LLGR timer deadline".format(remaining)

    step("Verify R2 bgpd stays alive past the stale LLGR timer deadline")
    _expect_none(_r2_bgpd_alive_after_stale_timer_deadline, LLGR_STALE_TIME + 10)
| 185 | + |
| 186 | + |
# Allow running this file directly; "-s" keeps pytest from capturing output.
if __name__ == "__main__":
    sys.exit(pytest.main(["-s", *sys.argv[1:]]))
0 commit comments