Skip to content

Commit 9f224f7

Browse files
JohnMcLearclaude
andauthored
feat: surface #7762 metrics in CSV + enable in dive workflow (#100)
Three follow-ups to make the next dive run informative: 1. Workflow now patches settings.json to set scalingDiveMetrics=true (added to core in #7762 but defaulting off there per project rule). Without this the three new metric rows the harness wants would never appear on /stats/prometheus. 2. CSV column rename: evloop_p95_ms -> evloop_p99_ms. prom-client's collectDefaultMetrics emits nodejs_eventloop_lag_p99_seconds (p50/p90/p99 — no p95), so the previous lookup was always empty. MD table header updated to match. 3. Two new curated CSV columns: - apply_mean_ms: from etherpad_changeset_apply_duration_seconds_sum and _count (histogram mean), converted to ms. Lets the dive attribute server-side latency to the apply path vs fan-out. - emits_new_changes: from etherpad_socket_emits_total{type=NEW_CHANGES}. Dominant fan-out cost; the column makes the batching lever's payoff visible. Both new columns are populated only when the underlying metrics exist on the SUT; older builds get empty cells (existing pattern). 48 tests green. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 542fbd9 commit 9f224f7

3 files changed

Lines changed: 40 additions & 13 deletions

File tree

.github/workflows/scaling-dive.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ jobs:
9999
s!"points":[^,]*!"points": 1000!
100100
' settings.json.template > settings.json
101101
102+
# Enable the scaling-dive metrics added in ether/etherpad#7762.
103+
# The key isn't in settings.json.template yet (defaults to false in
104+
# Settings.ts), so we inject it. Use node so JSON manipulation
105+
# stays valid regardless of key ordering / whitespace.
106+
node -e "const fs=require('fs'); const s=JSON.parse(fs.readFileSync('settings.json','utf8')); s.scalingDiveMetrics=true; fs.writeFileSync('settings.json', JSON.stringify(s, null, 2));"
107+
echo 'enabled: scalingDiveMetrics = true'
108+
102109
case "${{ matrix.lever }}" in
103110
baseline)
104111
echo "baseline — no further settings tweaks"

src/sim/reporter.ts

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,14 @@ export class Reporter {
6060
}
6161

6262
private toCsv(): string {
63-
const header = 'step,p50,p95,p99,max,throughput_csps,cpu_user,evloop_p95_ms,rss_mb,users,errors,break';
63+
// CSV column set is fixed. New gauges added via --scrape-keep are
64+
// queryable from JSON only — that's intentional so the CSV schema
65+
// stays stable across runs.
66+
//
67+
// apply_mean_ms and emits_new_changes are populated only when
68+
// ether/etherpad's `scalingDiveMetrics` setting is true (registered
69+
// since #7762). The harness tolerates them being absent.
70+
const header = 'step,p50,p95,p99,max,throughput_csps,cpu_user,evloop_p99_ms,rss_mb,users,apply_mean_ms,emits_new_changes,errors,break';
6471
const fmt = (n: number | undefined): string =>
6572
n === undefined || !Number.isFinite(n) ? '' : String(n);
6673
const cell = (g: Record<string, number>, key: string): string => {
@@ -70,12 +77,21 @@ export class Reporter {
7077
const rows = this.steps.map((s) => {
7178
const g = s.snapshot.gauges;
7279
// CSV column → /stats/prometheus row mapping. Names come from prom-client's
73-
// collectDefaultMetrics output that etherpad core actually emits.
80+
// collectDefaultMetrics output that etherpad core actually emits, plus the
81+
// three custom metrics added in ether/etherpad#7762.
7482
const rssBytes = g['process_resident_memory_bytes'];
7583
const rssMb = rssBytes !== undefined ? Math.round(rssBytes / 1_048_576) : undefined;
76-
const evloopP95s = g['nodejs_eventloop_lag_p95_seconds'];
77-
const evloopP95Ms = evloopP95s !== undefined ? Math.round(evloopP95s * 1000) : undefined;
84+
// prom-client exposes _p50/_p90/_p99 (no p95). p99 is the closest tail metric.
85+
const evloopP99s = g['nodejs_eventloop_lag_p99_seconds'];
86+
const evloopP99Ms = evloopP99s !== undefined ? Math.round(evloopP99s * 1000) : undefined;
7887
const cpuUserS = g['process_cpu_user_seconds_total'];
88+
// Histogram: mean = sum / count, converted to ms.
89+
const applySum = g['etherpad_changeset_apply_duration_seconds_sum'];
90+
const applyCount = g['etherpad_changeset_apply_duration_seconds_count'];
91+
const applyMeanMs = applySum !== undefined && applyCount !== undefined && applyCount > 0
92+
? Math.round((applySum / applyCount) * 1000 * 100) / 100
93+
: undefined;
94+
const emitsNewChanges = g['etherpad_socket_emits_total{type=NEW_CHANGES}'];
7995
return [
8096
s.step,
8197
fmt(s.latencyMs.p50),
@@ -84,9 +100,11 @@ export class Reporter {
84100
fmt(s.latencyMs.max),
85101
fmt(s.throughputCsps),
86102
fmt(cpuUserS),
87-
fmt(evloopP95Ms),
103+
fmt(evloopP99Ms),
88104
fmt(rssMb),
89105
cell(g, 'etherpad_total_users'),
106+
fmt(applyMeanMs),
107+
fmt(emitsNewChanges),
90108
s.errors,
91109
s.breakageFlags.join('|'),
92110
].join(',');
@@ -101,12 +119,12 @@ export class Reporter {
101119
`Run: ${m.runId}`,
102120
`SUT: ${m.sut.gitSha ?? '?'} (${m.sut.version ?? '?'}) on ${m.machine.cpus} · ${m.machine.totalMemMB} MB · node ${m.machine.node}`,
103121
'',
104-
'| Step | p50 | p95 | p99 | EL p95 | CPU% | Errors | Break |',
122+
'| Step | p50 | p95 | p99 | EL p99 | CPU% | Errors | Break |',
105123
'|---:|---:|---:|---:|---:|---:|---:|:---|',
106124
];
107125
const rows = this.steps.map((s) => {
108126
const g = s.snapshot.gauges;
109-
const elS = g['nodejs_eventloop_lag_p95_seconds'];
127+
const elS = g['nodejs_eventloop_lag_p99_seconds'];
110128
const elMs = elS !== undefined ? Math.round(elS * 1000) : '';
111129
const cpuS = g['process_cpu_user_seconds_total'];
112130
const cpu = cpuS !== undefined ? cpuS.toFixed(2) : '';

tests/sim/reporter.test.ts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ describe('Reporter CSV output', () => {
6666
const paths = await r.write();
6767
const csv = readFileSync(paths.csv, 'utf8');
6868
const lines = csv.trim().split('\n');
69-
expect(lines[0]).toBe('step,p50,p95,p99,max,throughput_csps,cpu_user,evloop_p95_ms,rss_mb,users,errors,break');
69+
expect(lines[0]).toBe('step,p50,p95,p99,max,throughput_csps,cpu_user,evloop_p99_ms,rss_mb,users,apply_mean_ms,emits_new_changes,errors,break');
7070
expect(lines[1]!.split(',')[0]).toBe('10');
7171
expect(lines[1]!.split(',')[1]).toBe('20');
7272
expect(lines[2]!.split(',')[0]).toBe('20');
@@ -81,11 +81,13 @@ describe('Reporter CSV output', () => {
8181
const paths = await r.write();
8282
const csv = readFileSync(paths.csv, 'utf8');
8383
const cols = csv.trim().split('\n')[1]!.split(',');
84-
// step,p50,p95,p99,max,throughput_csps,<cpu_user>,<evloop_p95_ms>,<rss_mb>,<users>,errors,break
85-
expect(cols[6]).toBe(''); // cpu_user missing
86-
expect(cols[7]).toBe(''); // evloop missing
87-
expect(cols[8]).toBe(''); // rss missing
88-
expect(cols[9]).toBe(''); // users missing
84+
// step,p50,p95,p99,max,throughput_csps,<cpu_user>,<evloop_p99_ms>,<rss_mb>,<users>,<apply_mean_ms>,<emits_new_changes>,errors,break
85+
expect(cols[6]).toBe(''); // cpu_user missing
86+
expect(cols[7]).toBe(''); // evloop missing
87+
expect(cols[8]).toBe(''); // rss missing
88+
expect(cols[9]).toBe(''); // users missing
89+
expect(cols[10]).toBe(''); // apply_mean_ms missing
90+
expect(cols[11]).toBe(''); // emits_new_changes missing
8991
});
9092
});
9193

0 commit comments

Comments
 (0)