Skip to content

Commit ba5ba4b

Browse files
committed
fix(sidecar): retry connect on Windows to fix startup race in named-pipe IPC
On Windows, a named-pipe client's CreateFileA fails immediately if the server has not yet called ConnectNamedPipe — there is no kernel-level connection backlog as Unix domain sockets provide. After daemonize() returns the child process exists but its Tokio runtime may not have entered the accept loop yet, causing test_ddog_sidecar_register_app to fail intermittently. Add connect_to_sidecar_with_retry() that retries the connection with exponential back-off (1 ms → 100 ms cap, up to 20 attempts ≈ 1.9 s total) when just_spawned is true. On non-Windows platforms is_pipe_not_ready() always returns false so the helper is a single-attempt no-op there. Remove the APMSP-2356 ignore annotation from test_ddog_sidecar_register_app so the fix is validated in CI.
1 parent 2cdcdcb commit ba5ba4b

File tree

2 files changed

+49
-9
lines changed

2 files changed

+49
-9
lines changed

datadog-sidecar-ffi/tests/sidecar.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,6 @@ fn test_ddog_sidecar_connection() {
6868
}
6969

7070
#[test]
71-
#[cfg_attr(
72-
target_os = "windows",
73-
ignore = "APMSP-2356 Investigate flakiness on Windows"
74-
)]
7571
#[cfg_attr(miri, ignore)]
7672
fn test_ddog_sidecar_register_app() {
7773
set_sidecar_per_process();

datadog-sidecar/src/entry.rs

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,50 @@ pub fn daemonize(listener: IpcServer, mut cfg: Config) -> anyhow::Result<()> {
274274
Ok(())
275275
}
276276

277+
/// Returns `true` for transient Windows named-pipe errors that indicate the server is not
278+
/// yet ready to accept connections (pipe instance doesn't exist yet, or all instances are
279+
/// busy). Always returns `false` on non-Windows so the retry loop is a no-op there.
280+
#[cfg(windows)]
281+
fn is_pipe_not_ready(e: &io::Error) -> bool {
282+
matches!(
283+
e.kind(),
284+
io::ErrorKind::NotFound
285+
| io::ErrorKind::ConnectionRefused
286+
| io::ErrorKind::WouldBlock
287+
)
288+
}
289+
290+
#[cfg(not(windows))]
291+
fn is_pipe_not_ready(_e: &io::Error) -> bool {
292+
false
293+
}
294+
295+
/// Connect to the sidecar, optionally retrying with exponential back-off.
296+
///
297+
/// On Windows, a named-pipe client's `CreateFileA` fails immediately if the server has not
298+
/// yet called `ConnectNamedPipe` — there is no kernel-level connection backlog like Unix
299+
/// domain sockets provide. When `just_spawned` is `true` (we just called `daemonize()`),
300+
/// we retry up to 20 times with doubling delays (1 ms → 100 ms cap, ≈ 1.9 s total budget)
301+
/// to give the child time to start its Tokio runtime and enter the accept loop.
302+
fn connect_to_sidecar_with_retry(
303+
liaison: &impl setup::Liaison,
304+
just_spawned: bool,
305+
) -> io::Result<SidecarTransport> {
306+
let max_attempts: u32 = if just_spawned { 20 } else { 1 };
307+
let mut delay = std::time::Duration::from_millis(1);
308+
for attempt in 0..max_attempts {
309+
match liaison.connect_to_server() {
310+
Ok(conn) => return Ok(SidecarTransport::from(conn)),
311+
Err(e) if attempt + 1 < max_attempts && is_pipe_not_ready(&e) => {
312+
std::thread::sleep(delay);
313+
delay = (delay * 2).min(std::time::Duration::from_millis(100));
314+
}
315+
Err(e) => return Err(e),
316+
}
317+
}
318+
liaison.connect_to_server().map(SidecarTransport::from)
319+
}
320+
277321
pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result<SidecarTransport> {
278322
// On Windows, named-pipe buffer sizes are fixed at creation time. Set the global before
279323
// attempt_listen so that the initial server pipe (created by this process and handed to the
@@ -289,18 +333,18 @@ pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result<SidecarTranspo
289333
config::IpcMode::InstancePerProcess => setup::DefaultLiason::ipc_per_process(),
290334
};
291335

336+
let mut just_spawned = false;
292337
let err = match liaison.attempt_listen() {
293338
Ok(Some(listener)) => {
294339
daemonize(listener, cfg)?;
340+
just_spawned = true;
295341
None
296342
}
297343
Ok(None) => None,
298344
err => err.context("Error starting sidecar").err(),
299345
};
300346

301-
Ok(SidecarTransport::from(
302-
liaison
303-
.connect_to_server()
304-
.map_err(|e| err.unwrap_or(e.into()))?,
305-
))
347+
connect_to_sidecar_with_retry(&liaison, just_spawned)
348+
.map_err(|e| err.unwrap_or(e.into()))
349+
.map_err(Into::into)
306350
}

0 commit comments

Comments
 (0)