Skip to content

Commit 7c9e3d5

Browse files
authored
fix: more robust metrics reporting in IRM monitor (#23038)
Fixes an issue where stuck requests would update the gauge after it had already been updated by subsequent requests that succeeded quickly. This forces the gauge to always be updated in sequence; otherwise the result is simply dropped. Also added some logging so we can see what's happening.
1 parent cc2e612 commit 7c9e3d5

6 files changed

Lines changed: 125 additions & 30 deletions

File tree

.github/workflows/deploy-irm.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ jobs:
6969
CLUSTER_NAME: ${{ inputs.cluster }}
7070
GKE_CLUSTER_CONTEXT: "gke_testnet-440309_us-west1-a_${{ inputs.cluster }}"
7171
REGION: us-west1-a
72-
INFURA_SECRET_NAME: infura-${{ inputs.l1_network }}-url
72+
ETHEREUM_HOSTS_SECRET_NAME: irm-ethereum-hosts-${{ inputs.l1_network }}
7373

7474
runs-on: ubuntu-latest
7575
steps:
@@ -150,4 +150,4 @@ jobs:
150150
echo "L1 network: ${{ inputs.l1_network }}"
151151
echo "Image tag: ${IMAGE_TAG}"
152152
153-
./spartan/metrics/irm-monitor/scripts/update-monitoring.sh $NAMESPACE $MONITORING_NAMESPACE ${{ inputs.network }} $INFURA_SECRET_NAME
153+
./spartan/metrics/irm-monitor/scripts/update-monitoring.sh $NAMESPACE $MONITORING_NAMESPACE ${{ inputs.network }} $ETHEREUM_HOSTS_SECRET_NAME
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0.0
1+
2.1.1

spartan/metrics/irm-monitor/index.ts

Lines changed: 116 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,50 @@ import express, { Request, Response } from "express";
22
import { createPublicClient, http } from "viem";
33
import client from "prom-client";
44

5-
const { ROLLUP_CONTRACT_ADDRESS, ETHEREUM_HOST, NETWORK } = process.env;
5+
const { ROLLUP_CONTRACT_ADDRESS, ETHEREUM_HOSTS, NETWORK } = process.env;
66

77
//////////////////////////////
88
// IMPORTANT: Bump VERSION file when making changes
99
//////////////////////////////
1010

11-
if (!ROLLUP_CONTRACT_ADDRESS || !ETHEREUM_HOST || !NETWORK) {
11+
const ethereumRpcUrls = (ETHEREUM_HOSTS ?? "")
12+
.split(",")
13+
.map((u: string) => u.trim())
14+
.filter(Boolean);
15+
16+
if (!ROLLUP_CONTRACT_ADDRESS || ethereumRpcUrls.length === 0 || !NETWORK) {
1217
console.error(
13-
"ROLLUP_CONTRACT_ADDRESS, ETHEREUM_HOST and NETWORK are required. Provided: ",
18+
"ROLLUP_CONTRACT_ADDRESS, ETHEREUM_HOSTS and NETWORK are required. Provided: ",
1419
ROLLUP_CONTRACT_ADDRESS,
15-
ETHEREUM_HOST,
16-
NETWORK
20+
ETHEREUM_HOSTS,
21+
NETWORK,
1722
);
1823
throw new Error(
19-
"ROLLUP_CONTRACT_ADDRESS, ETHEREUM_HOST and NETWORK are required"
24+
"ROLLUP_CONTRACT_ADDRESS, ETHEREUM_HOSTS and NETWORK are required",
2025
);
2126
}
2227

2328
if (!ROLLUP_CONTRACT_ADDRESS.startsWith("0x")) {
2429
throw new Error("ROLLUP_CONTRACT_ADDRESS must start with 0x");
2530
}
2631

27-
const transport = http(ETHEREUM_HOST);
32+
const RPC_TIMEOUT_MS = 12_000;
2833

29-
const publicClient = createPublicClient({
30-
transport,
31-
});
34+
const publicClientsByRpcUrl = new Map<
35+
string,
36+
ReturnType<typeof createPublicClient>
37+
>();
38+
39+
function getPublicClient(rpcUrl: string) {
40+
let c = publicClientsByRpcUrl.get(rpcUrl);
41+
if (!c) {
42+
c = createPublicClient({
43+
transport: http(rpcUrl, { timeout: RPC_TIMEOUT_MS }),
44+
});
45+
publicClientsByRpcUrl.set(rpcUrl, c);
46+
}
47+
return c;
48+
}
3249

3350
const ROLLUP_ABI = [
3451
{
@@ -74,23 +91,95 @@ const pendingCheckpointNumberGauge = new client.Gauge({
7491
labelNames: ["network"],
7592
});
7693

77-
async function updateCheckpointNumbers(): Promise<void> {
78-
try {
79-
const provenCheckpointNumber = await publicClient.readContract({
94+
const POLL_INTERVAL_MS = 36_000;
95+
96+
let lastStartedUpdateId = 0;
97+
98+
async function readCheckpointsFromRpc(
99+
rpcUrl: string,
100+
blockNumber: bigint,
101+
): Promise<{ proven: number; pending: number }> {
102+
const publicClient = getPublicClient(rpcUrl);
103+
const [provenCheckpointNumber, pendingCheckpointNumber] = await Promise.all([
104+
publicClient.readContract({
80105
address: ROLLUP_CONTRACT_ADDRESS as `0x${string}`,
81106
abi: ROLLUP_ABI,
82107
functionName: "getProvenCheckpointNumber",
83-
});
84-
provenCheckpointNumberGauge.set(Number(provenCheckpointNumber));
85-
86-
const pendingCheckpointNumber = await publicClient.readContract({
108+
blockNumber,
109+
}),
110+
publicClient.readContract({
87111
address: ROLLUP_CONTRACT_ADDRESS as `0x${string}`,
88112
abi: ROLLUP_ABI,
89113
functionName: "getPendingCheckpointNumber",
114+
blockNumber,
115+
}),
116+
]);
117+
return {
118+
proven: Number(provenCheckpointNumber),
119+
pending: Number(pendingCheckpointNumber),
120+
};
121+
}
122+
123+
async function updateCheckpointNumbers(): Promise<void> {
124+
const thisUpdateId = ++lastStartedUpdateId;
125+
const startedAt = Date.now();
126+
try {
127+
const blockNumber = await getPublicClient(
128+
ethereumRpcUrls[0]!,
129+
).getBlockNumber();
130+
const settled = await Promise.allSettled(
131+
ethereumRpcUrls.map((url) => readCheckpointsFromRpc(url, blockNumber)),
132+
);
133+
134+
if (thisUpdateId !== lastStartedUpdateId) {
135+
console.log("skipped stale checkpoint read", {
136+
updateId: thisUpdateId,
137+
latestUpdateId: lastStartedUpdateId,
138+
elapsedMs: Date.now() - startedAt,
139+
});
140+
return;
141+
}
142+
143+
const successes: { proven: number; pending: number }[] = [];
144+
const failures: { rpcUrl: string; reason: unknown }[] = [];
145+
for (let i = 0; i < settled.length; i++) {
146+
const r = settled[i]!;
147+
const rpcUrl = ethereumRpcUrls[i]!;
148+
if (r.status === "fulfilled") {
149+
successes.push(r.value);
150+
} else {
151+
failures.push({ rpcUrl, reason: r.reason });
152+
}
153+
}
154+
155+
if (successes.length === 0) {
156+
console.error(
157+
`checkpoint update failed: all ${ethereumRpcUrls.length} RPC host(s) failed (updateId=${thisUpdateId})`,
158+
failures,
159+
);
160+
return;
161+
}
162+
163+
const proven = Math.max(...successes.map((s) => s.proven));
164+
const pending = Math.max(...successes.map((s) => s.pending));
165+
provenCheckpointNumberGauge.set(proven);
166+
pendingCheckpointNumberGauge.set(pending);
167+
console.log("checkpoints updated", {
168+
updateId: thisUpdateId,
169+
proven,
170+
pending,
171+
rpcHostsOk: successes.length,
172+
rpcHostsFailed: failures.length,
173+
elapsedMs: Date.now() - startedAt,
90174
});
91-
pendingCheckpointNumberGauge.set(Number(pendingCheckpointNumber));
175+
if (failures.length > 0) {
176+
console.warn(
177+
`checkpoint read: ${failures.length} RPC host(s) failed; using max across ${successes.length} successful response(s)`,
178+
failures.map((f) => ({ rpcUrl: f.rpcUrl, reason: f.reason })),
179+
);
180+
}
92181
} catch (error) {
93-
console.error("Error updating checkpoint numbers:", error);
182+
console.error(`checkpoint update failed (updateId=${thisUpdateId})`, error);
94183
}
95184
}
96185

@@ -102,10 +191,16 @@ app.get("/metrics", async (_req: Request, res: Response) => {
102191

103192
const port = process.env.PORT ? Number(process.env.PORT) : 8080;
104193
app.listen(port, () => {
105-
console.log(`Metrics server listening on port ${port}`);
194+
console.log("metrics server listening", {
195+
port,
196+
network: NETWORK,
197+
rollup: ROLLUP_CONTRACT_ADDRESS,
198+
ethereumRpcUrls,
199+
pollIntervalMs: POLL_INTERVAL_MS,
200+
});
106201
});
107202

108-
setInterval(updateCheckpointNumbers, 36000);
203+
setInterval(updateCheckpointNumbers, POLL_INTERVAL_MS);
109204
updateCheckpointNumbers();
110205

111206
// Expose default process metrics, including process_start_time_seconds

spartan/metrics/irm-monitor/kubernetes/monitoring-deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@ spec:
2323
containerPort: 8080
2424
protocol: TCP
2525
env:
26-
- name: ETHEREUM_HOST
26+
- name: ETHEREUM_HOSTS
2727
valueFrom:
2828
secretKeyRef:
2929
name: irm-monitor-secrets
30-
key: infura-sepolia-url
30+
key: ethereum-hosts
3131
- name: ROLLUP_CONTRACT_ADDRESS
3232
value: "${ROLLUP_CONTRACT_ADDRESS}"
3333
- name: NETWORK

spartan/metrics/irm-monitor/kubernetes/monitoring-secret.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ metadata:
44
name: irm-monitor-secrets
55
type: Opaque
66
data:
7-
infura-sepolia-url: ""
7+
ethereum-hosts: ""
88
grafana-cloud-password: ""

spartan/metrics/irm-monitor/scripts/update-monitoring.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ set -e
88
NAMESPACE=${1:-"testnet"}
99
MONITORING_NAMESPACE=${2:-"$NAMESPACE-irm"}
1010
NETWORK=${3:-"$NAMESPACE"}
11-
INFURA_URL_SECRET=${4:-"infura-sepolia-url"}
11+
ETHEREUM_HOSTS_SECRET=${4:-"irm-ethereum-hosts-sepolia"}
1212

1313
# Deployment name includes the monitoring namespace prefix
1414
export DEPLOYMENT_NAME="${MONITORING_NAMESPACE}-monitor"
@@ -130,7 +130,7 @@ fi
130130

131131
# Fetch GCP secrets
132132
echo "Fetching GCP secrets..."
133-
INFURA_URL=$(gcloud secrets versions access latest --secret=$INFURA_URL_SECRET)
133+
ETHEREUM_HOSTS=$(gcloud secrets versions access latest --secret=$ETHEREUM_HOSTS_SECRET)
134134
GRAFANA_PASSWORD=$(gcloud secrets versions access latest --secret=grafana-cloud-password)
135135

136136
# Ensure monitoring namespace exists
@@ -139,7 +139,7 @@ kubectl get ns "$MONITORING_NAMESPACE" >/dev/null 2>&1 || kubectl create ns "$MO
139139
# Create/update secrets
140140
echo "Applying monitoring secrets..."
141141
kubectl -n "$MONITORING_NAMESPACE" create secret generic irm-monitor-secrets \
142-
--from-literal=infura-sepolia-url="$INFURA_URL" \
142+
--from-literal=ethereum-hosts="$ETHEREUM_HOSTS" \
143143
--from-literal=grafana-cloud-password="$GRAFANA_PASSWORD" \
144144
--dry-run=client -o yaml | kubectl apply -f -
145145

0 commit comments

Comments
 (0)