Skip to content

Commit 4f765e4

Browse files
feat: add triggerReason to pet.process_restart and globalScopeDeferred to setup.hang_detected (#1508)
- pet.process_restart now emits triggerReason (process_exit:<code>:<signal>, process_error, rpc_connection_error, rpc_refresh_timeout, rpc_resolve_timeout, rpc_configure_timeout, start_failed, unknown) so we can distinguish crash restarts from timeout restarts. - setup.hang_detected now emits globalScopeDeferred (true/false/undefined) so we can tell whether a watchdog trip was a real foreground hang or a background global-scope scan running past 120s after the user already had an env selected Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 33ab77e commit 4f765e4

4 files changed

Lines changed: 256 additions & 27 deletions

File tree

src/common/telemetry/constants.ts

Lines changed: 77 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,12 @@ export enum EventNames {
6060
*/
6161
MANAGER_REGISTRATION_FAILED = 'MANAGER_REGISTRATION.FAILED',
6262
/**
63-
* Telemetry event fired when the setup block appears to be hung.
64-
* A watchdog timer fires after a deadline; if the setup completes normally,
65-
* the timer is cancelled and this event never fires.
63+
* Watchdog event fired when setup appears hung. Cancelled if setup completes normally.
6664
* Properties:
67-
* - failureStage: string (which phase was in progress when the watchdog fired)
65+
* - failureStage: which phase was in progress when the watchdog fired
66+
* - globalScopeDeferred: distinguishes a real foreground hang from the benign 120s
67+
* background global-scope scan. String union (not boolean) because the sender drops
68+
* undefined. Values: 'deferred' | 'not_deferred' | 'unknown'.
6869
* Measures:
6970
* - duration: total elapsed since activation
7071
* - stageDuration: elapsed in the current stage
@@ -86,6 +87,14 @@ export enum EventNames {
8687
* - errorType: string (classified error category, on failure only)
8788
*/
8889
PET_INIT_DURATION = 'PET.INIT_DURATION',
90+
/**
91+
* Fired once per activation. Lets us slice every other PET event by which binary
92+
* version was running — important since PET evolves independently of the extension.
93+
* Properties:
94+
* - version: output of `pet --version` (e.g. '0.1.0'), or 'unknown' on failure/timeout
95+
* - source: 'envs_extension' | 'python_extension' — which extension shipped the binary
96+
*/
97+
PET_VERSION = 'PET.VERSION',
8998
/**
9099
* Telemetry event fired when applyInitialEnvironmentSelection begins.
91100
* Signals that all managers are registered and env selection is starting.
@@ -146,12 +155,16 @@ export enum EventNames {
146155
* Telemetry event for a PET refresh attempt (the core discovery RPC call).
147156
* Properties:
148157
* - result: 'success' | 'timeout' | 'error'
149-
* - envCount: number (environments returned via notifications)
150-
* - unresolvedCount: number (envs that needed follow-up resolve calls)
151-
* - workspaceDirCount: number (workspace directories sent in configure)
152-
* - searchPathCount: number (extra search paths sent in configure)
153-
* - attempt: number (0 = first try, 1 = retry)
154-
* - errorType: string (classified error category, on failure only)
158+
* - envCount, unresolvedCount, workspaceDirCount, searchPathCount: number
159+
* - attempt: 0 = first try, 1 = retry
160+
* - errorType: classified error category, on failure only
161+
* - locatorsJson: JSON-serialized Record<locatorName, ms>. Locator set is platform-dependent
162+
* so a flat blob is more practical than a fixed schema. Parse with parse_json() in Kusto.
163+
* Measures (numeric; phases run in parallel so sum may exceed total wall-clock):
164+
* - breakdownLocators: ms in locator plugins
165+
* - breakdownPathEnv: ms scanning PATH env var entries (not a file path)
166+
* - breakdownGlobalVirtualEnvs: ms scanning global virtualenv dirs
167+
* - breakdownWorkspaces: ms scanning workspace dirs
155168
*/
156169
PET_REFRESH = 'PET.REFRESH',
157170
/**
@@ -166,9 +179,15 @@ export enum EventNames {
166179
/**
167180
* Telemetry event for PET process restart attempts.
168181
* Properties:
169-
* - attempt: number (1-based restart attempt number)
182+
* - attempt: 1-based restart attempt number
170183
* - result: 'success' | 'error'
171-
* - errorType: string (classified error category, on failure only)
184+
* - errorType: classified error category, on failure only
185+
* - triggerReason: why the restart was needed (lets us separate crashes from
186+
* timeout-induced kills; the most specific reason wins — rpc_* recorded before a
187+
* kill is not overwritten by the subsequent exit event). Values:
188+
* rpc_connection_error | rpc_resolve_timeout | rpc_refresh_timeout |
189+
* rpc_configure_timeout | process_exit:<code>:<signal> | process_error |
190+
* start_failed | unknown
172191
*/
173192
PET_PROCESS_RESTART = 'PET.PROCESS_RESTART',
174193
/**
@@ -391,12 +410,20 @@ export interface IEventNamePropertyMapping {
391410
/* __GDPR__
392411
"setup.hang_detected": {
393412
"failureStage": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "StellaHuang95" },
413+
"globalScopeDeferred": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "eleanorjboyd" },
394414
"<duration>": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" },
395415
"<stageDuration>": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" }
396416
}
397417
*/
398418
[EventNames.SETUP_HANG_DETECTED]: {
399419
failureStage: string;
420+
/**
421+
* Distinguishes a real foreground hang from the benign 120s background global-scope scan.
422+
* - 'deferred': workspace env resolved; global scope ran in background.
423+
* - 'not_deferred': no workspace env; global scope was awaited as primary fallback.
424+
* - 'unknown': hang fired before env-selection reached the global-scope decision.
425+
*/
426+
globalScopeDeferred: 'deferred' | 'not_deferred' | 'unknown';
400427
};
401428

402429
/* __GDPR__
@@ -422,6 +449,19 @@ export interface IEventNamePropertyMapping {
422449
errorType?: string;
423450
};
424451

452+
/* __GDPR__
453+
"pet.version": {
454+
"version": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "eleanorjboyd" },
455+
"source": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "eleanorjboyd" }
456+
}
457+
*/
458+
[EventNames.PET_VERSION]: {
459+
/** Version string reported by `pet --version` (e.g. '0.1.0'), or 'unknown' if the lookup failed. */
460+
version: string;
461+
/** Which extension shipped the PET binary that's being used. */
462+
source: 'envs_extension' | 'python_extension';
463+
};
464+
425465
/* __GDPR__
426466
"env_selection.started": {
427467
"registeredManagerCount": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" },
@@ -515,6 +555,11 @@ export interface IEventNamePropertyMapping {
515555
"searchPathCount": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" },
516556
"attempt": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" },
517557
"errorType": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "eleanorjboyd" },
558+
"breakdownLocators": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "isMeasurement": true, "owner": "eleanorjboyd" },
559+
"breakdownPathEnv": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "isMeasurement": true, "owner": "eleanorjboyd" },
560+
"breakdownGlobalVirtualEnvs": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "isMeasurement": true, "owner": "eleanorjboyd" },
561+
"breakdownWorkspaces": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "isMeasurement": true, "owner": "eleanorjboyd" },
562+
"locatorsJson": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "owner": "eleanorjboyd" },
518563
"<duration>": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" }
519564
}
520565
*/
@@ -526,6 +571,17 @@ export interface IEventNamePropertyMapping {
526571
searchPathCount?: number;
527572
attempt: number;
528573
errorType?: string;
574+
// breakdown* fields go through the measures payload (numeric); listed here for GDPR only.
575+
/** ms in the Locators phase. */
576+
breakdownLocators?: number;
577+
/** ms walking PATH env var entries (not a file path). */
578+
breakdownPathEnv?: number;
579+
/** ms scanning global virtual-env dirs. */
580+
breakdownGlobalVirtualEnvs?: number;
581+
/** ms scanning workspace dirs. */
582+
breakdownWorkspaces?: number;
583+
/** JSON-serialized Record<locatorName, ms>. Parse with parse_json() in Kusto. */
584+
locatorsJson?: string;
529585
};
530586

531587
/* __GDPR__
@@ -549,13 +605,22 @@ export interface IEventNamePropertyMapping {
549605
"attempt": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" },
550606
"result": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "eleanorjboyd" },
551607
"errorType": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "eleanorjboyd" },
608+
"triggerReason": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "eleanorjboyd" },
552609
"<duration>": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "eleanorjboyd" }
553610
}
554611
*/
555612
[EventNames.PET_PROCESS_RESTART]: {
556613
attempt: number;
557614
result: 'success' | 'error';
558615
errorType?: string;
616+
/**
617+
* Why the restart was needed. The most specific reason wins (an rpc_* value recorded
618+
* before the kill is not overwritten by the subsequent exit/error event).
619+
* Values: rpc_connection_error | rpc_resolve_timeout | rpc_refresh_timeout |
620+
* rpc_configure_timeout | process_exit:<code>:<signal> | process_error |
621+
* start_failed | unknown.
622+
*/
623+
triggerReason: string;
559624
};
560625

561626
/* __GDPR__

src/extension.ts

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,12 @@ import { collectEnvironmentInfo, getEnvManagerAndPackageManagerConfigLevels, run
9696
import { EnvironmentManagers, ProjectCreators, PythonProjectManager } from './internal.api';
9797
import { registerSystemPythonFeatures } from './managers/builtin/main';
9898
import { SysPythonManager } from './managers/builtin/sysPythonManager';
99-
import { createNativePythonFinder, NativePythonFinder } from './managers/common/nativePythonFinder';
99+
import {
100+
createNativePythonFinder,
101+
getNativePythonToolsPathAndSource,
102+
getNativePythonToolsVersion,
103+
NativePythonFinder,
104+
} from './managers/common/nativePythonFinder';
100105
import { IDisposable } from './managers/common/types';
101106
import { registerCondaFeatures } from './managers/conda/main';
102107
import { registerPipenvFeatures } from './managers/pipenv/main';
@@ -553,6 +558,9 @@ export async function activate(context: ExtensionContext): Promise<PythonEnviron
553558
setImmediate(async () => {
554559
let failureStage = 'nativeFinder';
555560
const stageWatch = new StopWatch();
561+
// Mutable ref so the hang watchdog can report whether global scope was deferred
562+
// even if it fires mid-envSelection before applyInitialEnvironmentSelection returns.
563+
const globalScopeDeferredRef: { value: 'deferred' | 'not_deferred' | 'unknown' } = { value: 'unknown' };
556564
// Watchdog: fires if setup hasn't completed within 120s, indicating a likely hang
557565
const SETUP_HANG_TIMEOUT_MS = 120_000;
558566
let hangWatchdogActive = true;
@@ -572,7 +580,7 @@ export async function activate(context: ExtensionContext): Promise<PythonEnviron
572580
sendTelemetryEvent(
573581
EventNames.SETUP_HANG_DETECTED,
574582
{ duration: start.elapsedTime, stageDuration: stageWatch.elapsedTime },
575-
{ failureStage },
583+
{ failureStage, globalScopeDeferred: globalScopeDeferredRef.value },
576584
);
577585
}, SETUP_HANG_TIMEOUT_MS);
578586
context.subscriptions.push({ dispose: clearHangWatchdog });
@@ -583,6 +591,19 @@ export async function activate(context: ExtensionContext): Promise<PythonEnviron
583591
try {
584592
nativeFinder = await createNativePythonFinder(outputChannel, api, context);
585593
sendTelemetryEvent(EventNames.PET_INIT_DURATION, petStart.elapsedTime, { result: 'success' });
594+
// Fire-and-forget: report the bundled PET binary version so other PET telemetry
595+
// can be sliced by version. Don't block activation on this.
596+
void getNativePythonToolsPathAndSource()
597+
.then(async ({ toolPath, source }) => {
598+
const version = await getNativePythonToolsVersion(toolPath);
599+
sendTelemetryEvent(EventNames.PET_VERSION, undefined, { version, source });
600+
})
601+
.catch(() => {
602+
sendTelemetryEvent(EventNames.PET_VERSION, undefined, {
603+
version: 'unknown',
604+
source: 'python_extension',
605+
});
606+
});
586607
} catch (petError) {
587608
sendTelemetryEvent(
588609
EventNames.PET_INIT_DURATION,
@@ -618,7 +639,14 @@ export async function activate(context: ExtensionContext): Promise<PythonEnviron
618639

619640
failureStage = 'envSelection';
620641
stageWatch.reset();
621-
await applyInitialEnvironmentSelection(envManagers, projectManager, nativeFinder, api, start.elapsedTime);
642+
await applyInitialEnvironmentSelection(
643+
envManagers,
644+
projectManager,
645+
nativeFinder,
646+
api,
647+
start.elapsedTime,
648+
globalScopeDeferredRef,
649+
);
622650

623651
// Register manager-agnostic terminal watcher for package-modifying commands
624652
failureStage = 'terminalWatcher';

src/features/interpreterSelection.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ export async function applyInitialEnvironmentSelection(
284284
nativeFinder: NativePythonFinder,
285285
api: PythonEnvironmentApi,
286286
activationToReadyDurationMs?: number,
287+
globalScopeDeferredRef?: { value: 'deferred' | 'not_deferred' | 'unknown' },
287288
): Promise<void> {
288289
const folders = getWorkspaceFolders() ?? [];
289290
traceInfo(
@@ -377,6 +378,9 @@ export async function applyInitialEnvironmentSelection(
377378
if (workspaceFolderResolved) {
378379
// Defer global scope so it doesn't block post-selection startup.
379380
traceInfo('[interpreterSelection] Workspace env resolved, deferring global scope to background');
381+
if (globalScopeDeferredRef) {
382+
globalScopeDeferredRef.value = 'deferred';
383+
}
380384
resolveGlobalScope()
381385
.then(async (globalErrors) => {
382386
if (globalErrors.length > 0) {
@@ -386,6 +390,9 @@ export async function applyInitialEnvironmentSelection(
386390
.catch((err) => traceError(`[interpreterSelection] Background global scope resolution failed: ${err}`));
387391
} else {
388392
// No workspace folder resolved — global scope is the primary fallback, must await.
393+
if (globalScopeDeferredRef) {
394+
globalScopeDeferredRef.value = 'not_deferred';
395+
}
389396
const globalErrors = await resolveGlobalScope();
390397
allErrors.push(...globalErrors);
391398
}

0 commit comments

Comments
 (0)