Skip to content

Commit d9303ff

Browse files
committed
fix(mesh-store): degrade gracefully when coordinator port is held by orphan
Use a single connection attempt instead of a retry loop: retrying tls.connect after a failed handshake to a non-TLS endpoint can freeze the event loop (Node.js TLS session cache bug). If the coordinator port is occupied but unresponsive, report the failure via onError and continue without the mesh. Also make init() idempotent and expose a `connected` getter.
1 parent 1133260 commit d9303ff

1 file changed

Lines changed: 40 additions & 38 deletions

File tree

src/core/mesh-store.ts

Lines changed: 40 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,16 @@ export class MeshStore implements CommsStore {
7272
private peerInfo = new Map<string, PeerInfo>();
7373
private staleCheckTimer: ReturnType<typeof setInterval> | undefined;
7474
private isShutDown = false;
75+
private initialised = false;
7576
private pendingMarkReadTimers: ReturnType<typeof setTimeout>[] = [];
7677

78+
/** Whether the mesh has a live coordinator connection. */
79+
get connected(): boolean {
80+
return (
81+
this.transport.isCoordinator || this.transport.hasCoordinatorConnection
82+
);
83+
}
84+
7785
// -- Pending inbound connections awaiting approval --
7886
private pendingInboundConnections = new Map<
7987
string,
@@ -146,6 +154,8 @@ export class MeshStore implements CommsStore {
146154
// -----------------------------------------------------------------------
147155

148156
async init(): Promise<void> {
157+
if (this.initialised) return;
158+
this.initialised = true;
149159
await this.transport.startDataServer();
150160

151161
// Register our own peer info
@@ -156,57 +166,49 @@ export class MeshStore implements CommsStore {
156166
});
157167

158168
// Try joining an existing mesh; fall back to becoming coordinator.
159-
// If becomeCoordinator fails with EADDRINUSE (another process won the race),
160-
// retry connecting — the new coordinator should be ready by now.
161-
const MAX_RETRIES = 3;
162-
const RETRY_DELAY_MS = 200;
169+
//
170+
// Single attempt: connect to an existing coordinator, or become one.
171+
// If the coordinator port is occupied but unresponsive (e.g. an orphan
172+
// process from a previous session), degrade gracefully instead of
173+
// retrying. Retrying tls.connect after a failed handshake to a
174+
// non-TLS endpoint can freeze the event loop (Node.js TLS session
175+
// cache bug), so we only try once.
163176
let connected = false;
164-
let lastError: Error | undefined;
165177

166-
for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
178+
try {
179+
await this.transport.connectToCoordinator(
180+
COORDINATOR_HOST,
181+
this.coordinatorPort,
182+
this.peerId,
183+
this.transport.dataPort,
184+
);
185+
connected = true;
186+
} catch {
167187
try {
168-
await this.transport.connectToCoordinator(
188+
await this.transport.becomeCoordinator(
169189
COORDINATOR_HOST,
170190
this.coordinatorPort,
171-
this.peerId,
172-
this.transport.dataPort,
173191
);
192+
this.startStaleCheck();
174193
connected = true;
175-
break;
176-
} catch (err) {
177-
lastError = err instanceof Error ? err : new Error(String(err));
178-
// Only try to become coordinator on the first attempt
179-
if (attempt === 0) {
180-
try {
181-
await this.transport.becomeCoordinator(
182-
COORDINATOR_HOST,
183-
this.coordinatorPort,
184-
);
185-
this.startStaleCheck();
186-
connected = true;
187-
break;
188-
} catch (coordErr) {
189-
const msg =
190-
coordErr instanceof Error ? coordErr.message : String(coordErr);
191-
if (!msg.includes("EADDRINUSE")) {
192-
throw coordErr;
193-
}
194-
// EADDRINUSE — another process became coordinator. Retry connect.
195-
}
196-
}
197-
// Wait before retrying
198-
if (attempt < MAX_RETRIES - 1) {
199-
await new Promise<void>((resolve) =>
200-
setTimeout(resolve, RETRY_DELAY_MS),
201-
);
194+
} catch (coordErr) {
195+
const msg =
196+
coordErr instanceof Error ? coordErr.message : String(coordErr);
197+
if (!msg.includes("EADDRINUSE")) {
198+
throw coordErr;
202199
}
200+
// EADDRINUSE — port held by an unresponsive process. Degrade.
203201
}
204202
}
205203

206204
if (!connected) {
207-
throw new Error(
208-
`Failed to join or create mesh on port ${String(this.coordinatorPort)}: ${lastError?.message ?? "unknown error"}`,
205+
this.events.onError?.(
206+
new Error(
207+
`MeshStore: could not join or create mesh on port ${String(this.coordinatorPort)}. ` +
208+
"Running without mesh — agent-comms will be unavailable.",
209+
),
209210
);
211+
return;
210212
}
211213

212214
this.transport.unref();

0 commit comments

Comments
 (0)