Skip to content

Commit 8d1352b

Browse files
authored
feat(telemetry): instrument workspace lifecycle (#963)
Track workspace state machine and watcher events locally so build durations and agent transitions show up in the telemetry stream.

- workspace.state_transitioned: deduped on (status, transition, reason), emits observedDurationMs and observedBuildDurationMs on provisioner resolve
- workspace.agent.state_transitioned: deduped on (status, lifecycle_state), emits observedDurationMs
- workspace.start.triggered / workspace.update.triggered: spans around user-initiated operations
- workspace.update.prompted: separate span for the parameter prompt so dismissals are recorded as aborted rather than errored
- Fix: REST-API fallback now correctly passes collected parameters to the new build (was silently dropping them, breaking required-param updates on CLI < 2.24)

WorkspaceMonitor and WorkspaceStateMachine now take ServiceContainer directly to access the telemetry service.

Closes part of #906
1 parent 3071259 commit 8d1352b

12 files changed

Lines changed: 854 additions & 73 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
discovery/loss/recovery with sampled network info, and reconnecting
2626
WebSocket open/drop/reconnect/state transitions.
2727
- Local telemetry now records authentication refresh and recovery prompts.
28+
- Local telemetry now records workspace and agent state transitions with
29+
observed durations.
2830
- Path-like settings (`coder.binaryDestination`, `coder.tlsCertFile`,
2931
`coder.tlsKeyFile`, `coder.tlsCaFile`, `coder.tlsAltHost`,
3032
`coder.proxyLogDirectory`) and items in `coder.globalFlags` now support
@@ -47,6 +49,10 @@
4749
domains with Punycode (`xn--`) labels, can now be opened from recent
4850
connections. The SSH authority parser was splitting these names across the
4951
field separator and rejecting the host as invalid.
52+
- Updating a workspace on a CLI older than 2.24 (which can't run
53+
`coder update` non-interactively) now passes newly-required template
54+
parameters into the REST-API fallback build, instead of silently omitting
55+
them and letting the server reject the build.
5056
- Updating a workspace from VS Code no longer hangs when the new template
5157
version requires parameters. The extension now prompts for any missing
5258
required values through VS Code input boxes and passes them to

src/api/updateParameters.ts

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import type { Api } from "coder/site/src/api/api";
44
import type {
55
TemplateVersionParameter,
66
Workspace,
7+
WorkspaceBuildParameter,
78
} from "coder/site/src/api/typesGenerated";
89

910
/** Thrown when the user dismisses a parameter prompt. */
@@ -15,14 +16,14 @@ export class WorkspaceUpdateCancelledError extends Error {
1516
}
1617

1718
/**
18-
* Prompts the user for any newly-required template parameters and returns
19-
* `--parameter name=value` args suitable for `coder update`. Throws
20-
* `WorkspaceUpdateCancelledError` if the user dismisses a prompt.
19+
* Prompts the user for any newly-required template parameters and returns the
20+
* collected `{ name, value }` pairs. Throws `WorkspaceUpdateCancelledError` if
21+
* the user dismisses a prompt.
2122
*/
2223
export async function collectUpdateParameters(
2324
restClient: Api,
2425
workspace: Workspace,
25-
): Promise<string[]> {
26+
): Promise<WorkspaceBuildParameter[]> {
2627
const [newParams, currentValues] = await Promise.all([
2728
restClient.getTemplateVersionRichParameters(
2829
workspace.template_active_version_id,
@@ -35,16 +36,16 @@ export async function collectUpdateParameters(
3536
const existing = new Set(currentValues.map((p) => p.name));
3637
const toPrompt = candidates.filter((p) => !existing.has(p.name));
3738

38-
const args: string[] = [];
39+
const collected: WorkspaceBuildParameter[] = [];
3940
for (let i = 0; i < toPrompt.length; i++) {
4041
const param = toPrompt[i];
4142
const value = await promptForParameter(param, i + 1, toPrompt.length);
4243
if (value === undefined) {
4344
throw new WorkspaceUpdateCancelledError();
4445
}
45-
args.push("--parameter", `${param.name}=${value}`);
46+
collected.push({ name: param.name, value });
4647
}
47-
return args;
48+
return collected;
4849
}
4950

5051
function promptForParameter(

src/api/workspace.ts

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@ import * as vscode from "vscode";
44
import { getGlobalFlags, type CliAuth } from "../settings/cli";
55

66
import { errToStr, createWorkspaceIdentifier } from "./api-helper";
7-
import { collectUpdateParameters } from "./updateParameters";
87

98
import type { Api } from "coder/site/src/api/api";
109
import type {
1110
ProvisionerJobLog,
1211
Workspace,
1312
WorkspaceAgentLog,
13+
WorkspaceBuildParameter,
1414
} from "coder/site/src/api/typesGenerated";
1515

1616
import type { FeatureSet } from "../featureSet";
@@ -116,25 +116,31 @@ export async function startWorkspace(ctx: CliContext): Promise<Workspace> {
116116
}
117117

118118
/**
119-
* Update a workspace to the latest template version. Collects any newly-
120-
* required parameters via VS Code prompts and passes them to the CLI as flags
121-
* (the resolver phase can't render an interactive terminal). Falls back to
122-
* the REST API for CLIs older than 2.24.
119+
* Update a workspace to the latest template version. Callers must collect
120+
* any newly-required parameters via `collectUpdateParameters` first; this
121+
* function does not prompt. Falls back to the REST API on CLIs older than
122+
* 2.24.
123123
*/
124-
export async function updateWorkspace(ctx: CliContext): Promise<Workspace> {
124+
export async function updateWorkspace(
125+
ctx: CliContext,
126+
parameters: WorkspaceBuildParameter[],
127+
): Promise<Workspace> {
125128
if (!ctx.featureSet.cliUpdate) {
126-
return updateWorkspaceVersion(ctx);
129+
return updateWorkspaceViaApi(ctx, parameters);
127130
}
128131

129-
const paramArgs = await collectUpdateParameters(
130-
ctx.restClient,
131-
ctx.workspace,
132-
);
132+
const paramArgs = parameters.flatMap((p) => [
133+
"--parameter",
134+
`${p.name}=${p.value}`,
135+
]);
133136
await runCliCommand(ctx, ["update", ...paramArgs]);
134137
return ctx.restClient.getWorkspace(ctx.workspace.id);
135138
}
136139

137-
async function updateWorkspaceVersion(ctx: CliContext): Promise<Workspace> {
140+
async function updateWorkspaceViaApi(
141+
ctx: CliContext,
142+
parameters: WorkspaceBuildParameter[],
143+
): Promise<Workspace> {
138144
if (ctx.workspace.latest_build.status === "running") {
139145
ctx.write("Stopping workspace for update...\r\n");
140146
const stopBuild = await ctx.restClient.stopWorkspace(ctx.workspace.id);
@@ -145,7 +151,13 @@ async function updateWorkspaceVersion(ctx: CliContext): Promise<Workspace> {
145151
}
146152

147153
ctx.write("Starting workspace with updated template...\r\n");
148-
await ctx.restClient.updateWorkspaceVersion(ctx.workspace);
154+
const template = await ctx.restClient.getTemplate(ctx.workspace.template_id);
155+
await ctx.restClient.startWorkspace(
156+
ctx.workspace.id,
157+
template.active_version_id,
158+
undefined,
159+
parameters,
160+
);
149161
return ctx.restClient.getWorkspace(ctx.workspace.id);
150162
}
151163

src/instrumentation/workspace.ts

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
import { WorkspaceUpdateCancelledError } from "../api/updateParameters";
2+
3+
import type {
4+
Workspace,
5+
WorkspaceAgent,
6+
WorkspaceAgentLifecycle,
7+
WorkspaceAgentStatus,
8+
WorkspaceBuild,
9+
WorkspaceBuildParameter,
10+
WorkspaceStatus,
11+
} from "coder/site/src/api/typesGenerated";
12+
13+
import type { TelemetryReporter } from "../telemetry/reporter";
14+
15+
/** Sentinel for `from*` before any state is observed. `"unknown"` is a real server-reported value, so avoid it. */
16+
const INITIAL_STATE = "none";
17+
18+
/** Statuses where a provisioner job is actively running. */
19+
const PROVISIONING_STATUSES: ReadonlySet<WorkspaceStatus> = new Set([
20+
"pending",
21+
"starting",
22+
"stopping",
23+
"canceling",
24+
"deleting",
25+
]);
26+
27+
interface ObservedWorkspaceState {
28+
readonly status: WorkspaceStatus;
29+
readonly transition: WorkspaceBuild["transition"];
30+
readonly reason: WorkspaceBuild["reason"];
31+
readonly observedAtMs: number;
32+
}
33+
34+
interface ObservedAgentState {
35+
readonly status: WorkspaceAgentStatus;
36+
readonly lifecycleState: WorkspaceAgentLifecycle;
37+
readonly observedAtMs: number;
38+
}
39+
40+
/**
41+
* Emits `workspace.state_transitioned` as a workspace progresses through
42+
* statuses, plus `observedBuildDurationMs` when a provisioner run resolves.
43+
* Construct one per workspace; `WorkspaceMonitor` is the sole call site.
44+
*/
45+
export class WorkspaceStateTelemetry {
46+
private observed: ObservedWorkspaceState | undefined;
47+
/** Set on first observation of a provisioning status; cleared when the build resolves. */
48+
private buildStartedAtMs: number | undefined;
49+
50+
public constructor(
51+
private readonly telemetry: TelemetryReporter,
52+
private readonly workspaceName: string,
53+
) {}
54+
55+
public observe(workspace: Workspace): void {
56+
const { status, transition, reason } = workspace.latest_build;
57+
const previous = this.observed;
58+
if (
59+
previous?.status === status &&
60+
previous.transition === transition &&
61+
previous.reason === reason
62+
) {
63+
return;
64+
}
65+
66+
const now = performance.now();
67+
const measurements: Record<string, number> = previous
68+
? { observedDurationMs: now - previous.observedAtMs }
69+
: {};
70+
71+
const wasProvisioning =
72+
previous && PROVISIONING_STATUSES.has(previous.status);
73+
const isProvisioning = PROVISIONING_STATUSES.has(status);
74+
if (isProvisioning) {
75+
this.buildStartedAtMs ??= now;
76+
} else {
77+
if (wasProvisioning && this.buildStartedAtMs !== undefined) {
78+
measurements.observedBuildDurationMs = now - this.buildStartedAtMs;
79+
}
80+
this.buildStartedAtMs = undefined;
81+
}
82+
83+
this.telemetry.log(
84+
"workspace.state_transitioned",
85+
{
86+
workspaceName: this.workspaceName,
87+
from: previous?.status ?? INITIAL_STATE,
88+
to: status,
89+
transition,
90+
reason,
91+
},
92+
measurements,
93+
);
94+
this.observed = { status, transition, reason, observedAtMs: now };
95+
}
96+
}
97+
98+
/**
99+
* Emits `workspace.agent.state_transitioned` as the agent's `status` and
100+
* `lifecycle_state` change. The agent has two state dimensions so the event
101+
* carries qualified `fromStatus`/`toStatus` and `fromLifecycleState`/
102+
* `toLifecycleState` properties. Construct one per workspace.
103+
*/
104+
export class WorkspaceAgentTelemetry {
105+
private observed: ObservedAgentState | undefined;
106+
107+
public constructor(
108+
private readonly telemetry: TelemetryReporter,
109+
private readonly workspaceName: string,
110+
) {}
111+
112+
public observe(agent: WorkspaceAgent): void {
113+
const previous = this.observed;
114+
if (
115+
previous?.status === agent.status &&
116+
previous.lifecycleState === agent.lifecycle_state
117+
) {
118+
return;
119+
}
120+
const now = performance.now();
121+
122+
this.telemetry.log(
123+
"workspace.agent.state_transitioned",
124+
{
125+
workspaceName: this.workspaceName,
126+
agentName: agent.name,
127+
fromStatus: previous?.status ?? INITIAL_STATE,
128+
toStatus: agent.status,
129+
fromLifecycleState: previous?.lifecycleState ?? INITIAL_STATE,
130+
toLifecycleState: agent.lifecycle_state,
131+
},
132+
previous ? { observedDurationMs: now - previous.observedAtMs } : {},
133+
);
134+
this.observed = {
135+
status: agent.status,
136+
lifecycleState: agent.lifecycle_state,
137+
observedAtMs: now,
138+
};
139+
}
140+
141+
public reset(): void {
142+
this.observed = undefined;
143+
}
144+
}
145+
146+
/**
147+
* Wraps user-initiated workspace operations (start, update) as traced spans.
148+
* Stateless; safe to construct per call site.
149+
*/
150+
export class WorkspaceOperationTelemetry {
151+
public constructor(
152+
private readonly telemetry: TelemetryReporter,
153+
private readonly workspaceName: string,
154+
) {}
155+
156+
public traceUpdateTriggered<T>(fn: () => Promise<T>): Promise<T> {
157+
return this.telemetry.trace("workspace.update.triggered", fn, {
158+
workspaceName: this.workspaceName,
159+
});
160+
}
161+
162+
public traceStartTriggered<T>(fn: () => Promise<T>): Promise<T> {
163+
return this.telemetry.trace("workspace.start.triggered", fn, {
164+
workspaceName: this.workspaceName,
165+
});
166+
}
167+
168+
/**
169+
* Records dismissal as `result: "aborted"`. The framework treats any throw
170+
* as `result: "error"`, so we return inside the span and rethrow outside.
171+
*/
172+
public async traceUpdatePrompted(
173+
fn: () => Promise<WorkspaceBuildParameter[]>,
174+
): Promise<WorkspaceBuildParameter[]> {
175+
let cancel: WorkspaceUpdateCancelledError | undefined;
176+
const parameters = await this.telemetry.trace(
177+
"workspace.update.prompted",
178+
async (span) => {
179+
try {
180+
return await fn();
181+
} catch (error) {
182+
if (error instanceof WorkspaceUpdateCancelledError) {
183+
span.markAborted();
184+
cancel = error;
185+
return [];
186+
}
187+
throw error;
188+
}
189+
},
190+
{ workspaceName: this.workspaceName },
191+
);
192+
if (cancel) throw cancel;
193+
return parameters;
194+
}
195+
}

src/remote/remote.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -312,8 +312,7 @@ export class Remote {
312312
const monitor = await WorkspaceMonitor.create(
313313
workspace,
314314
workspaceClient,
315-
this.logger,
316-
this.contextManager,
315+
this.serviceContainer,
317316
);
318317
disposables.push(
319318
monitor,
@@ -331,8 +330,8 @@ export class Remote {
331330
args.startupMode,
332331
binaryPath,
333332
featureSet,
334-
this.logger,
335333
cliAuth,
334+
this.serviceContainer,
336335
);
337336
disposables.push(stateMachine);
338337

0 commit comments

Comments
 (0)