|
| 1 | +// |
| 2 | +// Copyright (c) 2026 Steve Gerbino |
| 3 | +// |
| 4 | +// Distributed under the Boost Software License, Version 1.0. (See accompanying |
| 5 | +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| 6 | +// |
| 7 | +// Official repository: https://github.com/cppalliance/corosio |
| 8 | +// |
| 9 | + |
| 10 | += Reconnect with Exponential Backoff |
| 11 | + |
| 12 | +This tutorial builds a TCP client that connects to a server and automatically |
| 13 | +reconnects with exponential backoff when the connection fails. You'll learn |
| 14 | +how to combine timers with sockets for retry logic and how to use stop tokens |
| 15 | +for graceful shutdown. |
| 16 | + |
| 17 | +NOTE: Code snippets assume: |
| 18 | +[source,cpp] |
| 19 | +---- |
| 20 | +#include <boost/corosio/endpoint.hpp> |
| 21 | +#include <boost/corosio/io_context.hpp> |
| 22 | +#include <boost/corosio/tcp_socket.hpp> |
| 23 | +#include <boost/corosio/timer.hpp> |
| 24 | +#include <boost/capy/buffers.hpp> |
| 25 | +#include <boost/capy/cond.hpp> |
| 26 | +#include <boost/capy/ex/run_async.hpp> |
| 27 | +#include <boost/capy/task.hpp> |
| 28 | +
|
| 29 | +namespace corosio = boost::corosio; |
| 30 | +namespace capy = boost::capy; |
| 31 | +---- |
| 32 | + |
| 33 | +== Overview |
| 34 | + |
| 35 | +Client applications often need to maintain a persistent connection to a server. |
| 36 | +When the server is temporarily unavailable — during a restart, a network blip, |
| 37 | +or a deployment — the client should retry rather than give up immediately. |
| 38 | +Retrying too aggressively wastes resources and can overwhelm a recovering |
| 39 | +server, so the delay between attempts should grow over time. |
| 40 | + |
| 41 | +Exponential backoff solves this: start with a short delay, double it on each |
| 42 | +failure, and cap it at a maximum. This gives fast recovery when the outage is |
| 43 | +brief and backs off gracefully when it isn't. |
| 44 | + |
| 45 | +This tutorial demonstrates: |
| 46 | + |
| 47 | +* Separating the backoff _policy_ (pure state) from the _mechanism_ (timer wait) |
| 48 | +* Using `timer` for inter-attempt delays |
| 49 | +* Graceful cancellation via stop tokens |
| 50 | +* Why `io_context::stop()` alone is not sufficient for coroutine shutdown |
| 51 | + |
| 52 | +== The Backoff Policy |
| 53 | + |
| 54 | +The delay logic is pure computation — no I/O, no coroutines. A simple value |
| 55 | +type tracks the current delay, doubles it on each call, and caps it at a |
| 56 | +configured maximum: |
| 57 | + |
| 58 | +[source,cpp] |
| 59 | +---- |
| 60 | +struct exponential_backoff |
| 61 | +{ |
| 62 | + using duration = std::chrono::milliseconds; |
| 63 | +
|
| 64 | +private: |
| 65 | + duration initial_; |
| 66 | + duration delay_; |
| 67 | + duration max_; |
| 68 | +
|
| 69 | +public: |
| 70 | + exponential_backoff(duration initial, duration max) noexcept |
| 71 | + : initial_(initial) |
| 72 | + , delay_(initial) |
| 73 | + , max_(max) |
| 74 | + { |
| 75 | + } |
| 76 | +
|
| 77 | + /// Return the current delay and advance to the next. |
| 78 | + duration next() noexcept |
| 79 | + { |
| 80 | + auto current = (std::min)(delay_, max_); |
| 81 | + delay_ = (std::min)(delay_ * 2, max_); |
| 82 | + return current; |
| 83 | + } |
| 84 | +
|
| 85 | + /// Restart the sequence from the initial delay. |
| 86 | + void reset() noexcept |
| 87 | + { |
| 88 | + delay_ = initial_; |
| 89 | + } |
| 90 | +}; |
| 91 | +---- |
| 92 | + |
| 93 | +With an initial delay of 500ms and a 30s cap, calling `next()` produces: |
| 94 | +500, 1000, 2000, 4000, 8000, 16000, 30000, 30000, ... |
| 95 | + |
| 96 | +Keeping the policy separate from the timer means it can be reused in any |
| 97 | +context — synchronous retries, tests, or logging — without pulling in |
| 98 | +async machinery. |
| 99 | + |
| 100 | +== Session Coroutine |
| 101 | + |
| 102 | +Once connected, the client reads data until the peer disconnects: |
| 103 | + |
| 104 | +[source,cpp] |
| 105 | +---- |
| 106 | +capy::task<> |
| 107 | +do_session(corosio::tcp_socket& sock) |
| 108 | +{ |
| 109 | + char buf[4096]; |
| 110 | + for (;;) |
| 111 | + { |
| 112 | + auto [ec, n] = |
| 113 | + co_await sock.read_some(capy::mutable_buffer(buf, sizeof buf)); |
| 114 | + if (ec) |
| 115 | + break; |
| 116 | + std::cout.write(buf, static_cast<std::streamsize>(n)); |
| 117 | + std::cout.flush(); |
| 118 | + } |
| 119 | +} |
| 120 | +---- |
| 121 | + |
| 122 | +This is the same read loop you would find in any echo client. The interesting |
| 123 | +part is what happens after it returns — the caller reconnects. |
| 124 | + |
| 125 | +== Reconnection Loop |
| 126 | + |
| 127 | +The retry loop ties everything together. On each failed connection it asks the |
| 128 | +backoff policy for the next delay, waits on a timer, and tries again: |
| 129 | + |
| 130 | +[source,cpp] |
| 131 | +---- |
| 132 | +capy::task<> |
| 133 | +connect_with_backoff( |
| 134 | + corosio::io_context& ioc, |
| 135 | + corosio::endpoint ep, |
| 136 | + exponential_backoff backoff, |
| 137 | + int max_attempts) |
| 138 | +{ |
| 139 | + corosio::tcp_socket sock(ioc); |
| 140 | + corosio::timer delay(ioc); |
| 141 | + int attempt = 0; |
| 142 | +
|
| 143 | + for (;;) |
| 144 | + { |
| 145 | + ++attempt; |
| 146 | +
|
| 147 | + auto [ec] = co_await sock.connect(ep); |
| 148 | + if (!ec) |
| 149 | + { |
| 150 | + std::cout << "Connected on attempt " << attempt << std::endl; |
| 151 | + co_await do_session(sock); |
| 152 | +
|
| 153 | + // Peer disconnected — restart the retry sequence |
| 154 | + sock.close(); |
| 155 | + backoff.reset(); |
| 156 | + attempt = 0; |
| 157 | + continue; |
| 158 | + } |
| 159 | +
|
| 160 | + sock.close(); |
| 161 | +
|
| 162 | + if (max_attempts > 0 && attempt >= max_attempts) |
| 163 | + co_return; |
| 164 | +
|
| 165 | + auto wait_for = backoff.next(); |
| 166 | +
|
| 167 | + delay.expires_after(wait_for); |
| 168 | + auto [timer_ec] = co_await delay.wait(); |
| 169 | + if (timer_ec == capy::cond::canceled) |
| 170 | + co_return; |
| 171 | +
|
| 172 | + // delay doubles automatically via backoff.next() |
| 173 | + } |
| 174 | +} |
| 175 | +---- |
| 176 | + |
| 177 | +There are two exit conditions: |
| 178 | + |
| 179 | +1. **Max attempts exhausted** — the coroutine gives up. |
| 180 | +2. **Timer cancelled** — someone signaled the stop token, requesting graceful |
| 181 | + shutdown. The coroutine unwinds through normal control flow. |
| 182 | + |
| 183 | +After a successful connection and subsequent disconnect, `backoff.reset()` |
| 184 | +restarts the delay sequence from the initial value. |
| 185 | + |
| 186 | +== Graceful Shutdown with Stop Tokens |
| 187 | + |
| 188 | +The key insight of this tutorial: **`io_context::stop()` does not cancel |
| 189 | +pending operations.** It only stops the event loop. Suspended coroutines are |
| 190 | +left in place and destroyed during `~io_context` without ever observing an |
| 191 | +error. This is by design — `stop()` is a _pause_ that preserves state for |
| 192 | +a potential `restart()`. |
| 193 | + |
| 194 | +For graceful shutdown where coroutines unwind through their own control flow, |
| 195 | +use a stop token: |
| 196 | + |
| 197 | +[source,cpp] |
| 198 | +---- |
| 199 | +std::stop_source stop_src; |
| 200 | +
|
| 201 | +capy::run_async(ioc.get_executor(), stop_src.get_token())( |
| 202 | + connect_with_backoff(ioc, ep, backoff, 10)); |
| 203 | +
|
| 204 | +// Later, from any thread: |
| 205 | +stop_src.request_stop(); |
| 206 | +---- |
| 207 | + |
| 208 | +When the stop source is signaled: |
| 209 | + |
| 210 | +1. The timer's `wait()` completes with an error matching `cond::canceled`. |
| 211 | +2. The coroutine checks the error and executes `co_return`. |
| 212 | +3. Local variables (`sock`, `delay`) are destroyed through normal unwinding. |
| 213 | +4. With no more outstanding work, `run()` returns. |
| 214 | +5. `~io_context` finds an empty heap — nothing to clean up. |
| 215 | + |
| 216 | +Contrast with calling `stop()` directly: |
| 217 | + |
| 218 | +1. `run()` exits immediately. |
| 219 | +2. The coroutine remains suspended — it never sees an error. |
| 220 | +3. `~io_context` calls `h.destroy()` on the coroutine frame, bypassing its |
| 221 | + error-handling logic. |
| 222 | + |
| 223 | +Both paths are safe (no leaks or crashes), but only the stop token path |
| 224 | +executes the coroutine's own cleanup code. |
| 225 | + |
| 226 | +[cols="1,1,1"] |
| 227 | +|=== |
| 228 | +| Mechanism | Coroutine sees cancellation? | Use case |
| 229 | + |
| 230 | +| `stop_token` |
| 231 | +| Yes — operations return `cond::canceled` |
| 232 | +| Graceful shutdown |
| 233 | + |
| 234 | +| `stop()` + `restart()` |
| 235 | +| No — coroutines stay suspended |
| 236 | +| Pause and resume the event loop |
| 237 | + |
| 238 | +| `~io_context` |
| 239 | +| No — frames destroyed via `h.destroy()` |
| 240 | +| Final cleanup (after `stop()` or natural exit) |
| 241 | +|=== |
| 242 | + |
| 243 | +== Main Function |
| 244 | + |
| 245 | +[source,cpp] |
| 246 | +---- |
| 247 | +int main(int argc, char* argv[]) |
| 248 | +{ |
| 249 | + if (argc != 3) |
| 250 | + { |
| 251 | + std::cerr << "Usage: reconnect <ip-address> <port>\n"; |
| 252 | + return EXIT_FAILURE; |
| 253 | + } |
| 254 | +
|
| 255 | + corosio::ipv4_address addr; |
| 256 | + if (auto ec = corosio::parse_ipv4_address(argv[1], addr); ec) |
| 257 | + { |
| 258 | + std::cerr << "Invalid IP address: " << argv[1] << "\n"; |
| 259 | + return EXIT_FAILURE; |
| 260 | + } |
| 261 | +
|
| 262 | + auto port = static_cast<std::uint16_t>(std::atoi(argv[2])); |
| 263 | +
|
| 264 | + corosio::io_context ioc; |
| 265 | +
|
| 266 | + using namespace std::chrono_literals; |
| 267 | + exponential_backoff backoff(500ms, 30s); |
| 268 | +
|
| 269 | + std::stop_source stop_src; |
| 270 | +
|
| 271 | + capy::run_async(ioc.get_executor(), stop_src.get_token())( |
| 272 | + connect_with_backoff(ioc, corosio::endpoint(addr, port), backoff, 10)); |
| 273 | +
|
| 274 | + // Run the event loop on a background thread so main |
| 275 | + // can signal cancellation after a timeout. |
| 276 | + auto worker = std::jthread([&ioc] { ioc.run(); }); |
| 277 | +
|
| 278 | + std::this_thread::sleep_for(5s); |
| 279 | + stop_src.request_stop(); |
| 280 | +} |
| 281 | +---- |
| 282 | + |
| 283 | +The event loop runs on a background thread. After five seconds the main thread |
| 284 | +signals cancellation. The coroutine observes `cond::canceled`, unwinds, the |
| 285 | +work count reaches zero, and `run()` returns. The `jthread` destructor joins |
| 286 | +automatically. |
| 287 | + |
| 288 | +== Testing |
| 289 | + |
| 290 | +Start an echo server on one terminal: |
| 291 | + |
| 292 | +[source,bash] |
| 293 | +---- |
| 294 | +$ ./echo_server 8080 10 |
| 295 | +Echo server listening on port 8080 with 10 workers |
| 296 | +---- |
| 297 | + |
| 298 | +Run the reconnect client on another: |
| 299 | + |
| 300 | +[source,bash] |
| 301 | +---- |
| 302 | +$ ./reconnect 127.0.0.1 8080 |
| 303 | +Connected on attempt 1 |
| 304 | +---- |
| 305 | + |
| 306 | +Stop the server and watch the client retry. (The diagnostic lines below come |
| 307 | +from logging in the complete example program; the snippets above omit the |
| 308 | +print statements for brevity.) |
| 307 | + |
| 308 | +[source,bash] |
| 309 | +---- |
| 310 | +Attempt 1 failed: Connection refused |
| 311 | +Retrying in 500ms |
| 312 | +Attempt 2 failed: Connection refused |
| 313 | +Retrying in 1000ms |
| 314 | +Attempt 3 failed: Connection refused |
| 315 | +Retrying in 2000ms |
| 316 | +---- |
| 317 | + |
| 318 | +Restart the server — the client reconnects on the next attempt. |
| 319 | + |
| 320 | +To test the no-server case, point the client at a port with nothing listening: |
| 321 | + |
| 322 | +[source,bash] |
| 323 | +---- |
| 324 | +$ ./reconnect 127.0.0.1 19999 |
| 325 | +Attempt 1 failed: Connection refused |
| 326 | +Retrying in 500ms |
| 327 | +Attempt 2 failed: Connection refused |
| 328 | +Retrying in 1000ms |
| 329 | +... |
| 330 | +Retry cancelled |
| 331 | +---- |
| 332 | + |
| 333 | +After five seconds the stop token fires and the client exits cleanly. |
| 334 | + |
| 335 | +== Next Steps |
| 336 | + |
| 337 | +* xref:../4.guide/4h.timers.adoc[Timers Guide] — Timer operations in detail |
| 338 | +* xref:../4.guide/4d.sockets.adoc[Sockets Guide] — Socket operations and error handling |
| 339 | +* xref:../4.guide/4c.io-context.adoc[I/O Context Guide] — Event loop mechanics, `stop()`, and `restart()` |
| 340 | +* xref:../4.guide/4m.error-handling.adoc[Error Handling] — Portable error conditions and `cond` |