Skip to content

Commit eda32cd

Browse files
authored
feat: scrape metrics data (#22840)
.
1 parent c6ae018 commit eda32cd

10 files changed

Lines changed: 1071 additions & 85 deletions

File tree

bootstrap.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ case "$cmd" in
785785
;;
786786
"ci-network-bench-10tps")
787787
# Args: <env_file> <namespace> [docker_image]
788-
# Deploys bench-10tps and runs the 38-min sustained 10 TPS benchmark.
788+
# Deploys bench-10tps and runs the 10-min sustained 10 TPS benchmark.
789789
# Cleanup is done separately via ci-network-teardown.
790790
export CI=1
791791
env_file="${1:?env_file is required}"

ci.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ case "$cmd" in
268268
;;
269269
network-bench-10tps)
270270
# Args: <scenario> <namespace> [docker_image]
271-
# Deploys the bench-10tps network and runs the 38-min 10 TPS benchmark.
271+
# Deploys the bench-10tps network and runs the 10-min 10 TPS benchmark.
272272
export CI_DASHBOARD="network"
273273
export JOB_ID="x-${2:?namespace is required}-network-bench-10tps" CPUS=16
274274
export INSTANCE_POSTFIX="n-bench-10tps"

spartan/bootstrap.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ function block_capacity_bench_cmds {
185185
function bench_10tps_cmds {
186186
local high_value_tps=10
187187
local low_value_tps=0
188-
local test_duration=${TEST_DURATION_SECONDS:-2280} # approx 1 epoch
188+
local test_duration=${TEST_DURATION_SECONDS:-600} # 10 mins
189189
local timeout=${BENCH_TIMEOUT_SECONDS:-3600}
190190
echo "$(hash):TIMEOUT=${timeout} BENCH_RUN_ID=${BENCH_RUN_ID:-} BENCH_OUTPUT=bench-out/n_tps.10tps.bench.json BENCH_SCENARIO=10tps LOW_VALUE_TPS=${low_value_tps} HIGH_VALUE_TPS=${high_value_tps} TEST_DURATION_SECONDS=${test_duration} $root/yarn-project/end-to-end/scripts/run_test.sh simple n_tps.test.ts"
191191
}
@@ -259,6 +259,8 @@ function bench_10tps {
259259
--target-tps 10 \
260260
--workload sha256_hash_1024 \
261261
--output "$run_json" \
262+
--wait-for-pending-zero \
263+
--max-pending-wait-seconds "${BENCH_SCRAPE_MAX_PENDING_WAIT_SECONDS:-3600}" \
262264
|| echo "[bench_10tps] scraper failed (non-fatal)"
263265
network_bench_upload "$run_json" || echo "[network_bench] upload failed (non-fatal)"
264266
else

spartan/environments/bench-10tps.env

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ AZTEC_SLOT_DURATION=72
1717
AZTEC_PROOF_SUBMISSION_EPOCHS=2
1818
AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET=1
1919
AZTEC_LAG_IN_EPOCHS_FOR_RANDAO=1
20+
AZTEC_INBOX_LAG=2
2021

2122
# 2B mana target - good for ~800 txs at 2.5M mana each
2223
AZTEC_MANA_TARGET=2000000000
@@ -39,6 +40,7 @@ SEQ_MAX_TX_PER_CHECKPOINT=800
3940
P2P_MAX_PENDING_TX_COUNT=20000
4041
SEQ_MIN_TX_PER_BLOCK=1
4142
SEQ_BUILD_CHECKPOINT_IF_EMPTY=true
43+
SEQ_ENABLE_PROPOSER_PIPELINING=true
4244

4345
RPC_REPLICAS=1
4446
RPC_RESOURCE_PROFILE="prod"
@@ -64,7 +66,7 @@ AZTEC_SLASHING_OFFSET_IN_ROUNDS=1
6466
AZTEC_LOCAL_EJECTION_THRESHOLD=90000000000000000000
6567

6668
DEBUG_P2P_INSTRUMENT_MESSAGES=true
67-
LOG_LEVEL='info;debug:simulator:public-processor'
69+
LOG_LEVEL='info;debug:simulator:public-processor,sequencer:state,sequencer:checkpoint-events'
6870

6971
VALIDATOR_L1_PRIORITY_FEE_BUMP_PERCENTAGE=0
7072
VALIDATOR_L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE=0

spartan/scripts/bench_10tps/bench_output.schema.json

Lines changed: 134 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@
3838
"description": "Discrete occurrences during the run (chain prunes and per-slot summaries). Typically rendered as annotations on charts.",
3939
"items": { "$ref": "#/$defs/event" }
4040
},
41+
"sequencerStateSlots": {
42+
"type": "array",
43+
"description": "Per-slot sequencer state time budget reconstructed from structured state-transition logs. Optional for older runs and empty when sequencer debug logs were not enabled.",
44+
"items": { "$ref": "#/$defs/sequencerStateSlot" }
45+
},
4146
"notes": {
4247
"type": "array",
4348
"items": { "type": "string" },
@@ -65,12 +70,29 @@
6570
"format": "date-time",
6671
"description": "Wall-clock time the load generator stopped sending txs (not the test-teardown time)."
6772
},
73+
"inclusionEndedAt": {
74+
"type": "string",
75+
"format": "date-time",
76+
"description": "Wall-clock time the scraper considers proposer-visible load fully included. With pending-drain scraping enabled, this is when validator pending TxPool depth first reached zero; otherwise it falls back to the bounded scrape window end. RPC/full-node pending may remain non-zero when load failed to propagate to validators."
77+
},
6878
"drainEndedAt": {
6979
"type": "string",
7080
"format": "date-time",
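7181
"description": "Wall-clock time the scraper began querying Prometheus. Typically endedAt + ~90s to let the OTel batch push (60s default) and one Prom scrape (15s) settle. When the scraper waits for validator pending TxPool depth to reach zero, querying starts only after the pool drains or the pending-drain timeout expires, so this can be later."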
7282
},
7383
"namespace": { "type": "string", "examples": ["bench-10tps"] },
84+
"gcpProject": {
85+
"type": "string",
86+
"description": "GCP project containing the GKE container logs."
87+
},
88+
"gcpLocation": {
89+
"type": "string",
90+
"description": "GKE cluster location used by Cloud Logging resource labels."
91+
},
92+
"gkeCluster": {
93+
"type": "string",
94+
"description": "GKE cluster name used by Cloud Logging resource labels."
95+
},
7496
"image": {
7597
"type": "string",
7698
"description": "Aztec image tag or digest the validators ran."
@@ -109,7 +131,35 @@
109131
"minimum": 1,
110132
"description": "PromQL range-query step."
111133
},
112-
"promUrl": { "type": "string" }
134+
"promUrl": { "type": "string" },
135+
"waitForPendingZero": {
136+
"type": "boolean",
137+
"description": "Whether live scraping waited for validator pending TxPool depth to reach zero before querying."
138+
},
139+
"maxPendingWaitSeconds": {
140+
"type": "integer",
141+
"minimum": 0,
142+
"description": "Maximum time the scraper was allowed to wait for validator pending TxPool depth to reach zero."
143+
},
144+
"pendingAtScrape": {
145+
"type": ["number", "null"],
146+
"minimum": 0,
147+
"description": "Validator pending TxPool depth observed when scraping started, or null when the pending drain gate was disabled."
148+
},
149+
"pendingByRoleAtScrape": {
150+
"type": ["object", "null"],
151+
"description": "Pending TxPool depth by pod role at scrape start. RPC/full-node pending can remain non-zero after validators drain, which indicates load that did not propagate to proposers before expiry.",
152+
"additionalProperties": false,
153+
"properties": {
154+
"rpc": { "type": ["number", "null"], "minimum": 0 },
155+
"validator": { "type": ["number", "null"], "minimum": 0 },
156+
"fullNode": { "type": ["number", "null"], "minimum": 0 }
157+
}
158+
},
159+
"pendingWaitTimedOut": {
160+
"type": "boolean",
161+
"description": "True if scraping began because the pending-drain timeout expired."
162+
}
113163
}
114164
}
115165
}
@@ -127,9 +177,12 @@
127177
"targetTps": { "type": "number" },
128178
"inclusionTpsMean": {
129179
"type": ["number", "null"],
130-
"description": "Mean inclusion TPS over the steady-state window (run window minus warm-up)."
180+
"description": "Exact block-log inclusion throughput over the observed inclusion window: totalTxsMined / (inclusionEndedAt - startedAt)."
181+
},
182+
"inclusionTpsPeak": {
183+
"type": ["number", "null"],
184+
"description": "Peak sampled Prometheus rolling inclusion rate over the observed scrape window."
131185
},
132-
"inclusionTpsPeak": { "type": ["number", "null"] },
133186
"inclusionLatencyP50Ms": { "type": ["number", "null"] },
134187
"inclusionLatencyP95Ms": { "type": ["number", "null"] },
135188
"inclusionLatencyP99Ms": { "type": ["number", "null"] },
@@ -168,6 +221,7 @@
168221
"mempoolSizeRpc": { "$ref": "#/$defs/timeSeries" },
169222
"mempoolSizeValidator": { "$ref": "#/$defs/timeSeries" },
170223
"mempoolSizeFullNode": { "$ref": "#/$defs/timeSeries" },
224+
"mempoolMinedMax": { "$ref": "#/$defs/timeSeries" },
171225
"mempoolEvictedByReasonRate": { "$ref": "#/$defs/timeSeries" },
172226
"mempoolRejectedByReasonRate": { "$ref": "#/$defs/timeSeries" },
173227
"blockBuildDurationP95": { "$ref": "#/$defs/timeSeries" },
@@ -181,7 +235,7 @@
181235
"l1InclusionDelayP95": { "$ref": "#/$defs/timeSeries" },
182236
"gossipLatencyP95": { "$ref": "#/$defs/timeSeries" },
183237
"peerCountMean": { "$ref": "#/$defs/timeSeries" },
184-
"attestationsCollectDurationP95": { "$ref": "#/$defs/timeSeries" },
238+
"attestationsCollectDurationMean": { "$ref": "#/$defs/timeSeries" },
185239
"attestationsCollectAllowanceMean": { "$ref": "#/$defs/timeSeries" },
186240
"txCollectorTxsFromMempoolRate": { "$ref": "#/$defs/timeSeries" },
187241
"txCollectorTxsFromP2pRate": { "$ref": "#/$defs/timeSeries" },
@@ -203,7 +257,7 @@
203257
},
204258
"unit": {
205259
"type": "string",
206-
"examples": ["ms", "tps", "gas/s", "count"]
260+
"examples": ["ms", "tps", "mana/s", "count"]
207261
},
208262
"source": {
209263
"type": "string",
@@ -231,7 +285,7 @@
231285
"labels": {
232286
"type": "object",
233287
"additionalProperties": { "type": "string" },
234-
"description": "Prometheus labels that disambiguate this series. Empty {} for single-series queries. Common keys: k8s_pod_name, aztec_gossip_topic_name, rejection_reason, sequencer_state."
288+
"description": "Prometheus labels that disambiguate this series. Empty {} for single-series queries. Common keys: k8s_pod_name, aztec_gossip_topic_name, rejection_reason, aztec_sequencer_state."
235289
},
236290
"points": {
237291
"type": "array",
@@ -304,7 +358,7 @@
304358
"required": ["at", "type"],
305359
"properties": {
306360
"at": { "type": "string", "format": "date-time" },
307-
"type": { "type": "string", "const": "chainPruned" },
361+
"type": { "type": "string", "enum": ["chainPruned", "slotSummary"] },
308362
"source": { "type": "string", "const": "log" },
309363
"fromBlock": {
310364
"type": "integer",
@@ -313,6 +367,79 @@
313367
"toBlock": {
314368
"type": "integer",
315369
"description": "For chainPruned: the post-prune tip."
370+
},
371+
"slotNumber": {
372+
"type": "integer",
373+
"description": "For slotSummary: L2 slot number."
374+
},
375+
"buildSlot": {
376+
"type": "integer",
377+
"description": "For slotSummary: wall-clock slot in which the checkpoint was built."
378+
},
379+
"checkpointNumber": { "type": "integer" },
380+
"sourcePod": { "type": "string" },
381+
"proposer": {
382+
"type": "string",
383+
"description": "Validator/proposer address selected for this slot."
384+
},
385+
"attestorAddress": { "type": "string" },
386+
"publisherAddress": { "type": "string" },
387+
"blocksBuilt": { "type": "number", "minimum": 0 },
388+
"txCount": { "type": "number", "minimum": 0 },
389+
"totalMana": { "type": "number", "minimum": 0 },
390+
"blockBuildFailures": {
391+
"type": "array",
392+
"items": { "type": "object", "additionalProperties": true }
393+
},
394+
"checkpointBuildFailure": {
395+
"type": "object",
396+
"additionalProperties": true
397+
},
398+
"attestations": {
399+
"type": "object",
400+
"additionalProperties": true,
401+
"description": "For slotSummary: committee size, required/collected counts, and missing validator addresses when known."
402+
},
403+
"publish": {
404+
"type": "object",
405+
"additionalProperties": true,
406+
"description": "For slotSummary: checkpoint publish status and L1 publisher action breakdown."
407+
}
408+
}
409+
},
410+
411+
"sequencerStateSlot": {
412+
"type": "object",
413+
"additionalProperties": false,
414+
"required": ["slotNumber", "startedAt", "endedAt", "totalMs", "states"],
415+
"properties": {
416+
"slotNumber": {
417+
"type": "integer",
418+
"description": "L2 slot number whose sequencer-state durations are represented."
419+
},
420+
"startedAt": {
421+
"type": "string",
422+
"format": "date-time",
423+
"description": "Timestamp of the first parsed transition for this pod-slot."
424+
},
425+
"endedAt": {
426+
"type": "string",
427+
"format": "date-time",
428+
"description": "Timestamp of the last parsed transition for this pod-slot."
429+
},
430+
"sourcePod": {
431+
"type": "string",
432+
"description": "Validator pod whose transitions were selected for this slot. The scraper chooses the pod-slot with the strongest proposer-state signal."
433+
},
434+
"totalMs": {
435+
"type": "number",
436+
"minimum": 0,
437+
"description": "Sum of all state durations in this slot record."
438+
},
439+
"states": {
440+
"type": "object",
441+
"additionalProperties": { "type": "number", "minimum": 0 },
442+
"description": "Map from SequencerState name to total milliseconds spent in that state during this slot."
316443
}
317444
}
318445
}

0 commit comments

Comments
 (0)