Skip to content

Commit f983b64

Browse files
authored
fix: graduate PET configure timeout to avoid killing process prematurely (Fixes #1274) (#1275)
Fix PET configure timeout killing the process prematurely, which prevents recovery on large workspaces. ## Changes - Introduce `RpcTimeoutError` class with a `method` property for type-safe timeout detection instead of fragile string matching - Graduate configure timeouts with exponential backoff on timeout duration: 1st attempt 30s, retry 60s, then kill — giving PET up to 90s total before restart - Move `lastConfiguration` cache write to after successful response — previously a failed configure would cache the options, causing subsequent calls to skip the configure request entirely - Clear `lastConfiguration` on any configure failure to ensure retries are not skipped by `configurationEquals` - Prevent double-kill: `resolve()` and `doRefresh()` no longer kill the process on configure timeouts (already handled inside `configure()`) - Reset `configureTimeoutCount` on successful configure and on process restart ## Root Cause When PET takes >30s to process configuration (e.g., large filesystem with glob patterns like `**/.venv` in dev containers), the extension immediately killed it and restarted. PET then had to re-process the same configuration from scratch, hitting the same timeout again, burning through all 3 restart attempts and leaving PET permanently dead. Fixes #1274 Fixes #1266
1 parent 6703525 commit f983b64

File tree

2 files changed

+222
-13
lines changed

2 files changed

+222
-13
lines changed

src/managers/common/nativePythonFinder.ts

Lines changed: 103 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,56 @@ const RESOLVE_TIMEOUT_MS = 30_000; // 30 seconds for single resolve
2323
// Restart/recovery constants
2424
const MAX_RESTART_ATTEMPTS = 3;
2525
const RESTART_BACKOFF_BASE_MS = 1_000; // 1 second base, exponential: 1s, 2s, 4s
26+
const MAX_CONFIGURE_TIMEOUTS_BEFORE_KILL = 2; // Kill on the 2nd consecutive timeout
27+
28+
/**
29+
* Computes the configure timeout with exponential backoff.
30+
* @param retryCount Number of consecutive configure timeouts so far
31+
* @returns Timeout in milliseconds: 30s, 60s, 120s, ... capped at REFRESH_TIMEOUT_MS
32+
*/
33+
export function getConfigureTimeoutMs(retryCount: number): number {
34+
return Math.min(CONFIGURE_TIMEOUT_MS * Math.pow(2, retryCount), REFRESH_TIMEOUT_MS);
35+
}
36+
37+
/**
38+
* Encapsulates the configure retry state machine.
39+
* Tracks consecutive timeout count and decides whether to kill the process.
40+
*/
41+
export class ConfigureRetryState {
42+
private _timeoutCount: number = 0;
43+
44+
get timeoutCount(): number {
45+
return this._timeoutCount;
46+
}
47+
48+
/** Returns the timeout duration for the current attempt (with exponential backoff). */
49+
getTimeoutMs(): number {
50+
return getConfigureTimeoutMs(this._timeoutCount);
51+
}
52+
53+
/** Call after a successful configure. Resets the timeout counter. */
54+
onSuccess(): void {
55+
this._timeoutCount = 0;
56+
}
57+
58+
/**
59+
* Call after a configure timeout. Increments the counter and returns
60+
* whether the process should be killed (true = kill, false = let it continue).
61+
*/
62+
onTimeout(): boolean {
63+
this._timeoutCount++;
64+
if (this._timeoutCount >= MAX_CONFIGURE_TIMEOUTS_BEFORE_KILL) {
65+
this._timeoutCount = 0;
66+
return true; // Kill the process
67+
}
68+
return false; // Let PET continue
69+
}
70+
71+
/** Call after a non-timeout error or process restart. Resets the counter. */
72+
reset(): void {
73+
this._timeoutCount = 0;
74+
}
75+
}
2676

2777
export async function getNativePythonToolsPath(): Promise<string> {
2878
const envsExt = getExtension(ENVS_EXTENSION_ID);
@@ -119,14 +169,27 @@ interface RefreshOptions {
119169
searchPaths?: string[];
120170
}
121171

172+
/**
173+
* Error thrown when a JSON-RPC request times out.
174+
*/
175+
export class RpcTimeoutError extends Error {
176+
constructor(
177+
public readonly method: string,
178+
timeoutMs: number,
179+
) {
180+
super(`Request '${method}' timed out after ${timeoutMs}ms`);
181+
this.name = this.constructor.name;
182+
}
183+
}
184+
122185
/**
123186
* Wraps a JSON-RPC sendRequest call with a timeout.
124187
* @param connection The JSON-RPC connection
125188
* @param method The RPC method name
126189
* @param params The parameters to send
127190
* @param timeoutMs Timeout in milliseconds
128191
* @returns The result of the request
129-
* @throws Error if the request times out
192+
* @throws RpcTimeoutError if the request times out
130193
*/
131194
async function sendRequestWithTimeout<T>(
132195
connection: rpc.MessageConnection,
@@ -138,7 +201,7 @@ async function sendRequestWithTimeout<T>(
138201
const timeoutPromise = new Promise<never>((_, reject) => {
139202
const timer = setTimeout(() => {
140203
cts.cancel();
141-
reject(new Error(`Request '${method}' timed out after ${timeoutMs}ms`));
204+
reject(new RpcTimeoutError(method, timeoutMs));
142205
}, timeoutMs);
143206
// Clear timeout if the CancellationTokenSource is disposed
144207
cts.token.onCancellationRequested(() => clearTimeout(timer));
@@ -161,6 +224,7 @@ class NativePythonFinderImpl implements NativePythonFinder {
161224
private startFailed: boolean = false;
162225
private restartAttempts: number = 0;
163226
private isRestarting: boolean = false;
227+
private readonly configureRetry = new ConfigureRetryState();
164228

165229
constructor(
166230
private readonly outputChannel: LogOutputChannel,
@@ -192,8 +256,9 @@ class NativePythonFinderImpl implements NativePythonFinder {
192256
this.restartAttempts = 0;
193257
return environment;
194258
} catch (ex) {
195-
// On timeout, kill the hung process so next request triggers restart
196-
if (ex instanceof Error && ex.message.includes('timed out')) {
259+
// On resolve timeout (not configure — configure handles its own timeout),
260+
// kill the hung process so next request triggers restart
261+
if (ex instanceof RpcTimeoutError && ex.method !== 'configure') {
197262
this.outputChannel.warn('[pet] Resolve request timed out, killing hung process for restart');
198263
this.killProcess();
199264
this.processExited = true;
@@ -260,6 +325,7 @@ class NativePythonFinderImpl implements NativePythonFinder {
260325
this.processExited = false;
261326
this.startFailed = false;
262327
this.lastConfiguration = undefined; // Force reconfiguration
328+
this.configureRetry.reset();
263329

264330
// Start fresh
265331
this.connection = this.start();
@@ -544,8 +610,9 @@ class NativePythonFinderImpl implements NativePythonFinder {
544610
// Reset restart attempts on successful refresh
545611
this.restartAttempts = 0;
546612
} catch (ex) {
547-
// On timeout, kill the hung process so next request triggers restart
548-
if (ex instanceof Error && ex.message.includes('timed out')) {
613+
// On refresh timeout (not configure — configure handles its own timeout),
614+
// kill the hung process so next request triggers restart
615+
if (ex instanceof RpcTimeoutError && ex.method !== 'configure') {
549616
this.outputChannel.warn('[pet] Request timed out, killing hung process for restart');
550617
this.killProcess();
551618
this.processExited = true;
@@ -583,17 +650,40 @@ class NativePythonFinderImpl implements NativePythonFinder {
583650
return;
584651
}
585652
this.outputChannel.info('[pet] configure: Sending configuration update:', JSON.stringify(options));
653+
// Exponential backoff: 30s, 60s on retry. Capped at REFRESH_TIMEOUT_MS.
654+
const timeoutMs = this.configureRetry.getTimeoutMs();
655+
if (this.configureRetry.timeoutCount > 0) {
656+
this.outputChannel.info(
657+
`[pet] configure: Using extended timeout of ${timeoutMs}ms (retry ${this.configureRetry.timeoutCount})`,
658+
);
659+
}
586660
try {
661+
await sendRequestWithTimeout(this.connection, 'configure', options, timeoutMs);
662+
// Only cache after success so failed/timed-out calls will retry
587663
this.lastConfiguration = options;
588-
await sendRequestWithTimeout(this.connection, 'configure', options, CONFIGURE_TIMEOUT_MS);
664+
this.configureRetry.onSuccess();
589665
} catch (ex) {
590-
// On timeout, kill the hung process so next request triggers restart
591-
if (ex instanceof Error && ex.message.includes('timed out')) {
592-
this.outputChannel.warn('[pet] Configure request timed out, killing hung process for restart');
593-
this.killProcess();
594-
this.processExited = true;
666+
// Clear cached config so the next call retries instead of short-circuiting via configurationEquals
667+
this.lastConfiguration = undefined;
668+
if (ex instanceof RpcTimeoutError) {
669+
const shouldKill = this.configureRetry.onTimeout();
670+
if (shouldKill) {
671+
this.outputChannel.error(
672+
'[pet] Configure timed out on consecutive attempts, killing hung process for restart',
673+
);
674+
this.killProcess();
675+
this.processExited = true;
676+
} else {
677+
this.outputChannel.warn(
678+
`[pet] Configure request timed out (attempt ${this.configureRetry.timeoutCount}/${MAX_CONFIGURE_TIMEOUTS_BEFORE_KILL}), ` +
679+
'will retry on next request without killing process',
680+
);
681+
}
682+
} else {
683+
// Non-timeout errors reset the counter so only consecutive timeouts are counted
684+
this.configureRetry.reset();
685+
this.outputChannel.error('[pet] configure: Configuration error', ex);
595686
}
596-
this.outputChannel.error('[pet] configure: Configuration error', ex);
597687
throw ex;
598688
}
599689
}
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import assert from 'node:assert';
2+
import {
3+
ConfigureRetryState,
4+
RpcTimeoutError,
5+
getConfigureTimeoutMs,
6+
} from '../../../managers/common/nativePythonFinder';
7+
8+
suite('RpcTimeoutError', () => {
9+
test('has correct name property', () => {
10+
const error = new RpcTimeoutError('configure', 30000);
11+
assert.strictEqual(error.name, 'RpcTimeoutError');
12+
});
13+
14+
test('has correct method property', () => {
15+
const error = new RpcTimeoutError('configure', 30000);
16+
assert.strictEqual(error.method, 'configure');
17+
});
18+
19+
test('message includes method name and timeout', () => {
20+
const error = new RpcTimeoutError('resolve', 5000);
21+
assert.strictEqual(error.message, "Request 'resolve' timed out after 5000ms");
22+
});
23+
24+
test('is instanceof Error', () => {
25+
const error = new RpcTimeoutError('configure', 30000);
26+
assert.ok(error instanceof Error);
27+
assert.ok(error instanceof RpcTimeoutError);
28+
});
29+
});
30+
31+
suite('getConfigureTimeoutMs', () => {
32+
test('returns base timeout (30s) on first attempt', () => {
33+
assert.strictEqual(getConfigureTimeoutMs(0), 30_000);
34+
});
35+
36+
test('doubles timeout on first retry (60s)', () => {
37+
assert.strictEqual(getConfigureTimeoutMs(1), 60_000);
38+
});
39+
40+
test('doubles again on second retry (120s)', () => {
41+
assert.strictEqual(getConfigureTimeoutMs(2), 120_000);
42+
});
43+
44+
test('caps at REFRESH_TIMEOUT_MS (120s) for higher retries', () => {
45+
// 30_000 * 2^3 = 240_000, but capped at 120_000
46+
assert.strictEqual(getConfigureTimeoutMs(3), 120_000);
47+
assert.strictEqual(getConfigureTimeoutMs(10), 120_000);
48+
});
49+
});
50+
51+
suite('ConfigureRetryState', () => {
52+
let state: ConfigureRetryState;
53+
54+
setup(() => {
55+
state = new ConfigureRetryState();
56+
});
57+
58+
test('initial timeout count is 0', () => {
59+
assert.strictEqual(state.timeoutCount, 0);
60+
});
61+
62+
test('initial timeout is base (30s)', () => {
63+
assert.strictEqual(state.getTimeoutMs(), 30_000);
64+
});
65+
66+
test('onSuccess resets timeout count', () => {
67+
state.onTimeout(); // count = 1
68+
state.onSuccess();
69+
assert.strictEqual(state.timeoutCount, 0);
70+
assert.strictEqual(state.getTimeoutMs(), 30_000);
71+
});
72+
73+
test('first timeout does not kill (returns false)', () => {
74+
const shouldKill = state.onTimeout();
75+
assert.strictEqual(shouldKill, false);
76+
assert.strictEqual(state.timeoutCount, 1);
77+
});
78+
79+
test('first timeout increases next timeout to 60s', () => {
80+
state.onTimeout();
81+
assert.strictEqual(state.getTimeoutMs(), 60_000);
82+
});
83+
84+
test('second consecutive timeout kills (returns true)', () => {
85+
state.onTimeout(); // count = 1
86+
const shouldKill = state.onTimeout(); // count = 2 → kill → reset to 0
87+
assert.strictEqual(shouldKill, true);
88+
assert.strictEqual(state.timeoutCount, 0); // Reset after kill
89+
});
90+
91+
test('non-timeout error resets counter via reset()', () => {
92+
state.onTimeout(); // count = 1
93+
state.reset(); // simulates non-timeout error
94+
assert.strictEqual(state.timeoutCount, 0);
95+
// Next timeout should not trigger kill
96+
const shouldKill = state.onTimeout();
97+
assert.strictEqual(shouldKill, false);
98+
});
99+
100+
test('interleaved non-timeout error prevents premature kill', () => {
101+
state.onTimeout(); // count = 1
102+
state.reset(); // non-timeout error resets
103+
state.onTimeout(); // count = 1 again (not 2)
104+
assert.strictEqual(state.timeoutCount, 1);
105+
// Still shouldn't kill — only 1 consecutive timeout
106+
assert.strictEqual(state.getTimeoutMs(), 60_000);
107+
});
108+
109+
test('reset after kill allows fresh retry cycle', () => {
110+
state.onTimeout();
111+
state.onTimeout(); // kill → reset
112+
// Counter was reset by onTimeout when it returned true
113+
assert.strictEqual(state.timeoutCount, 0);
114+
assert.strictEqual(state.getTimeoutMs(), 30_000);
115+
// First timeout of new cycle should not kill
116+
const shouldKill = state.onTimeout();
117+
assert.strictEqual(shouldKill, false);
118+
});
119+
});

0 commit comments

Comments
 (0)