Skip to content

Commit fe4c129

Browse files
samcm and barnabasbusa authored
feat(otel): opt-in tracing to the engine OTel stack (#1405)
When `otel` is added to `additional_services`, the package discovers the engine's `kurtosis otel` collector via the default gateway, fails fast if it isn't running, and injects per-client OTLP trace flags plus `OTEL_*` env vars (carrying enclave name/uuid resource attributes) so client traces land in ClickHouse tenanted by enclave. This supersedes the in-enclave ClickHouse + log-bridge approach in #1393. Depends on the engine-side `kurtosis otel` command: kurtosis-tech/kurtosis#3122. --------- Signed-off-by: Barnabas Busa <barnabas.busa@ethereum.org> Co-authored-by: Barnabas Busa <barnabas.busa@ethereum.org> Co-authored-by: Barnabas Busa <busa.barnabas@gmail.com>
1 parent 22685bd commit fe4c129

33 files changed

Lines changed: 672 additions & 100 deletions

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Optional features (enabled via flags or parameter files at runtime):
2929
- Specify the required parameters for the nodes to reach an external block building network
3030
- Generate keystores for each node in parallel
3131
- Spin up [TrueBlocks](https://github.com/TrueBlocks/trueblocks-core) (`chifra daemon`) to serve the chifra REST API on port 8080 (`/status`, `/blocks`, `/list`, `/chunks`, etc.). The scraper isn't started automatically; POST `/scrape` (or run `chifra scrape` against the same data dir) when you want to build the local [Unchained Index](https://trueblocks.io/docs/install/get-the-index/). Auto-tunes scrape parameters for devnets vs public networks.
32+
- Ship traces from every EL/CL/VC to the engine-level Kurtosis OTel stack by adding `otel` to `additional_services`. Start the stack with `kurtosis otel start` before launching the package. Traces land in the shared ClickHouse, tenanted by enclave; requires the Docker backend.
3233

3334
## Quickstart
3435

@@ -1068,6 +1069,7 @@ additional_services:
10681069
- grafana
10691070
- mempool_bridge
10701071
- nginx
1072+
- otel
10711073
- prometheus
10721074
- rakoon
10731075
- slashoor

main.star

Lines changed: 277 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,245 @@ HTTP_PORT_ID_FOR_FACT = "http"
7777
MEV_BOOST_SHOULD_CHECK_RELAY = True
7878
PATH_TO_PARSED_BEACON_STATE = "/genesis/output/parsedBeaconState.json"
7979

80+
# Non-default ports so the engine OTel stack does not collide with a host-level
81+
# OTLP collector (4317/4318) or ClickHouse (8123). Must match `kurtosis otel start`.
82+
ENGINE_OTEL_OTLP_GRPC_PORT = 14317
83+
ENGINE_OTEL_OTLP_HTTP_PORT = 14318
84+
ENGINE_OTEL_CLICKHOUSE_HTTP_PORT = 18123
85+
ENGINE_OTEL_DISCOVERY_OUTPUT_FILE = "/tmp/engine-otel-discovery.json"
86+
ENGINE_OTEL_DISCOVERY_ARTIFACT_NAME = "engine-otel-discovery"
87+
ENGINE_OTEL_DISCOVERY_MOUNT_DIR = "/engine-otel-discovery"
88+
ENGINE_OTEL_DISCOVERY_SCRIPT_FILENAME = "engine-otel-discovery.sh"
89+
ENGINE_OTEL_DISCOVERY_SCRIPT_ARTIFACT_NAME = "engine-otel-discovery-script"
90+
ENGINE_OTEL_DISCOVERY_SCRIPT_MOUNT_DIR = "/engine-otel-discovery-script"
91+
92+
ENGINE_OTEL_DISCOVERY_SCRIPT = r"""#!/bin/sh
93+
set -eu
94+
95+
route_line=$(awk "\$2 == \"00000000\" { print \$1 \" \" \$3; exit }" /proc/net/route)
96+
if [ -z "$route_line" ]; then
97+
echo "default route not found" >&2
98+
exit 1
99+
fi
100+
101+
iface=$(printf "%s" "$route_line" | awk "{ print \$1 }")
102+
gateway_hex=$(printf "%s" "$route_line" | awk "{ print \$2 }")
103+
if [ -z "$gateway_hex" ]; then
104+
echo "default gateway not found" >&2
105+
exit 1
106+
fi
107+
108+
gateway=$(printf "%d.%d.%d.%d" \
109+
"0x$(printf "%s" "$gateway_hex" | cut -c7-8)" \
110+
"0x$(printf "%s" "$gateway_hex" | cut -c5-6)" \
111+
"0x$(printf "%s" "$gateway_hex" | cut -c3-4)" \
112+
"0x$(printf "%s" "$gateway_hex" | cut -c1-2)")
113+
114+
own_ip=""
115+
if command -v ip >/dev/null 2>&1; then
116+
own_ip=$(ip -4 -o addr show dev "$iface" scope global | awk "{ split(\$4, a, \"/\"); print a[1]; exit }")
117+
fi
118+
if [ -z "$own_ip" ]; then
119+
own_ip=$(hostname -i 2>/dev/null | tr " " "\n" | awk "split(\$1, a, \".\") == 4 && a[1] != \"127\" { print; exit }")
120+
fi
121+
if [ -z "$own_ip" ]; then
122+
echo "probe IPv4 not found" >&2
123+
exit 1
124+
fi
125+
126+
clickhouse_ping_url="http://${gateway}:{{ .ClickHousePort }}/ping"
127+
if ! curl -fsS "$clickhouse_ping_url" >/dev/null; then
128+
echo "engine OTel stack is not reachable at ${clickhouse_ping_url}; run 'kurtosis otel start' before adding 'otel' to additional_services" >&2
129+
exit 1
130+
fi
131+
132+
enclaves_json=$(curl -fsS -XPOST \
133+
-H "Content-Type: application/json" \
134+
-d "{}" \
135+
"http://${gateway}:9710/engine_api.EngineService/GetEnclaves")
136+
137+
cat > /tmp/engine-otel-discovery.jq <<\JQ
138+
def prefix16($ip):
139+
($ip | split(".")[0:2] | join("."));
140+
141+
def prefix22($ip):
142+
($ip | split(".")) as $octets
143+
| "\($octets[0]).\($octets[1]).\((($octets[2] | tonumber) / 4 | floor) * 4)";
144+
145+
(.enclaveInfo // {})
146+
| to_entries
147+
| map(.value | select(.apiContainerInfo.ipInsideEnclave != null))
148+
| (
149+
map(select(prefix22(.apiContainerInfo.ipInsideEnclave) == prefix22($own_ip))) as $matches22
150+
| if ($matches22 | length) == 1 then
151+
$matches22[0]
152+
else
153+
map(select(prefix16(.apiContainerInfo.ipInsideEnclave) == prefix16($own_ip))) as $matches16
154+
| if ($matches16 | length) == 1 then
155+
$matches16[0]
156+
else
157+
error("unable to identify enclave for probe IP \($own_ip)")
158+
end
159+
end
160+
)
161+
| {
162+
gateway: $gateway,
163+
enclave_uuid: .enclaveUuid,
164+
enclave_name: .name
165+
}
166+
JQ
167+
168+
printf "%s" "$enclaves_json" | jq -c \
169+
--arg gateway "$gateway" \
170+
--arg own_ip "$own_ip" \
171+
-f /tmp/engine-otel-discovery.jq > /tmp/engine-otel-discovery.json
172+
cat /tmp/engine-otel-discovery.json
173+
"""
174+
175+
176+
def new_engine_otel_endpoints(gateway=None, enclave_uuid=None, enclave_name=None):
    """Builds the struct describing how clients reach the engine OTel stack.

    Args:
        gateway: Gateway address used to reach the engine OTel stack, or None
            to obtain a placeholder struct.
        enclave_uuid: UUID of the current enclave, used as a resource attribute.
        enclave_name: Name of the current enclave, used as a resource attribute.

    Returns:
        A struct with the fields `gateway`, `enclave_uuid`, `enclave_name`,
        `resource_attributes`, `otlp_grpc_url`, `otlp_http_traces_url`,
        `clickhouse_host` and `clickhouse_port`. Every field is None when
        `gateway` is None.
    """
    if gateway != None:
        # Resource attributes let the collector tenant traces by enclave.
        enclave_attributes = "kurtosis.enclave.name={},kurtosis.enclave.uuid={}".format(
            enclave_name,
            enclave_uuid,
        )
        grpc_url = "http://{}:{}".format(gateway, ENGINE_OTEL_OTLP_GRPC_PORT)
        http_traces_url = "http://{}:{}/v1/traces".format(
            gateway,
            ENGINE_OTEL_OTLP_HTTP_PORT,
        )
        return struct(
            gateway=gateway,
            enclave_uuid=enclave_uuid,
            enclave_name=enclave_name,
            resource_attributes=enclave_attributes,
            otlp_grpc_url=grpc_url,
            otlp_http_traces_url=http_traces_url,
            clickhouse_host=gateway,
            clickhouse_port=ENGINE_OTEL_CLICKHOUSE_HTTP_PORT,
        )

    # No gateway: return an all-None placeholder with the same field set so
    # callers can read the fields unconditionally.
    return struct(
        gateway=None,
        enclave_uuid=None,
        enclave_name=None,
        resource_attributes=None,
        otlp_grpc_url=None,
        otlp_http_traces_url=None,
        clickhouse_host=None,
        clickhouse_port=None,
    )
205+
206+
207+
def detect_engine_otel_endpoints(plan, global_tolerations, global_node_selectors):
    """Runs the discovery script and returns the engine OTel endpoints.

    Renders ENGINE_OTEL_DISCOVERY_SCRIPT, executes it with `plan.run_sh`,
    stores the JSON file it writes as a files artifact, and then reads the
    `gateway`, `enclave_uuid` and `enclave_name` fields back out of it.

    Args:
        plan: Kurtosis plan used to render templates and run the tasks.
        global_tolerations: Tolerations forwarded to every task.
        global_node_selectors: Node selectors forwarded to every task.

    Returns:
        The struct produced by `new_engine_otel_endpoints` for the discovered
        gateway and enclave identity.
    """
    script_template = shared_utils.new_template_and_data(
        ENGINE_OTEL_DISCOVERY_SCRIPT,
        {"ClickHousePort": ENGINE_OTEL_CLICKHOUSE_HTTP_PORT},
    )
    rendered_script = plan.render_templates(
        {ENGINE_OTEL_DISCOVERY_SCRIPT_FILENAME: script_template},
        name=ENGINE_OTEL_DISCOVERY_SCRIPT_ARTIFACT_NAME,
    )

    script_path = "{}/{}".format(
        ENGINE_OTEL_DISCOVERY_SCRIPT_MOUNT_DIR,
        ENGINE_OTEL_DISCOVERY_SCRIPT_FILENAME,
    )
    discovery_output = StoreSpec(
        src=ENGINE_OTEL_DISCOVERY_OUTPUT_FILE,
        name=ENGINE_OTEL_DISCOVERY_ARTIFACT_NAME,
    )
    discovery_run = plan.run_sh(
        name="detect-engine-otel",
        description="Detecting enclave identity and engine OTel endpoints",
        run="/bin/sh {}".format(script_path),
        files={ENGINE_OTEL_DISCOVERY_SCRIPT_MOUNT_DIR: rendered_script},
        store=[discovery_output],
        tolerations=shared_utils.get_tolerations(global_tolerations=global_tolerations),
        node_selectors=global_node_selectors,
    )

    # Only one StoreSpec is declared above, so the discovery JSON is the first
    # (and only) stored artifact.
    discovery_artifact = discovery_run.files_artifacts[0]

    # One read task per field, in the same order as the fields are consumed.
    gateway, enclave_uuid, enclave_name = [
        read_engine_otel_discovery_field(
            plan,
            discovery_artifact,
            field_name,
            global_tolerations,
            global_node_selectors,
        )
        for field_name in ("gateway", "enclave_uuid", "enclave_name")
    ]

    plan.print(
        "Using engine-level OTel collector via enclave gateway {} for enclave {} ({})".format(
            gateway,
            enclave_name,
            enclave_uuid,
        )
    )
    return new_engine_otel_endpoints(gateway, enclave_uuid, enclave_name)
266+
267+
268+
def read_engine_otel_discovery_field(
    plan,
    discovery_artifact,
    field,
    global_tolerations,
    global_node_selectors,
):
    """Reads one top-level field from the stored engine OTel discovery JSON.

    Args:
        plan: Kurtosis plan used to run the read task.
        discovery_artifact: Files artifact holding the discovery JSON file.
        field: Name of the top-level JSON field to extract (e.g. "gateway").
        global_tolerations: Tolerations forwarded to the task.
        global_node_selectors: Node selectors forwarded to the task.

    Returns:
        The `output` of the `plan.run_sh` result, i.e. the raw field value
        printed without a trailing newline.
    """

    # The artifact exposes the stored file under its base name; derive that
    # name from the output-path constant instead of repeating the literal, so
    # the reader cannot drift from the path the discovery task stores.
    discovery_filename = ENGINE_OTEL_DISCOVERY_OUTPUT_FILE.split("/")[-1]
    discovery_path = "{}/{}".format(
        ENGINE_OTEL_DISCOVERY_MOUNT_DIR,
        discovery_filename,
    )

    result = plan.run_sh(
        name="read-engine-otel-{}".format(field.replace("_", "-")),
        description="Reading engine OTel discovery field {}".format(field),
        # `jq -e` makes the task fail when the field is missing/null; the
        # value is re-emitted through printf so no trailing newline is kept.
        run='value=$(jq -er ".{}" {}) && printf "%s" "$value"'.format(
            field,
            discovery_path,
        ),
        files={
            ENGINE_OTEL_DISCOVERY_MOUNT_DIR: discovery_artifact,
        },
        tolerations=shared_utils.get_tolerations(global_tolerations=global_tolerations),
        node_selectors=global_node_selectors,
    )
    return result.output
289+
290+
291+
def append_otel_resource_attributes(env_vars, resource_attributes):
    """Merges resource attributes into `OTEL_RESOURCE_ATTRIBUTES` in place.

    Entries are compared as whole comma-separated `key=value` items rather
    than by substring, so an attribute such as `a=1` is not mistaken for being
    present when the existing value only contains `xa=10` or `a=10`.

    Args:
        env_vars: Dict of environment variables; mutated in place.
        resource_attributes: Comma-separated `key=value` list to add. None or
            an empty string leaves `env_vars` untouched.
    """
    if not resource_attributes:
        return

    existing = env_vars.get("OTEL_RESOURCE_ATTRIBUTES", "")
    if existing == "":
        env_vars["OTEL_RESOURCE_ATTRIBUTES"] = resource_attributes
        return

    already_present = [item.strip() for item in existing.split(",")]
    missing = []
    for raw_item in resource_attributes.split(","):
        candidate = raw_item.strip()
        if candidate == "":
            continue
        if candidate in already_present or candidate in missing:
            continue
        missing.append(candidate)

    # Keep the caller's existing value verbatim and append only what is new.
    if missing:
        env_vars["OTEL_RESOURCE_ATTRIBUTES"] = "{},{}".format(
            existing,
            ",".join(missing),
        )
300+
301+
302+
def add_otel_resource_attributes_to_participants(participants, resource_attributes):
    """Adds the resource attributes to every participant's EL, CL and VC env vars.

    Args:
        participants: Participants whose `el_extra_env_vars`,
            `cl_extra_env_vars` and `vc_extra_env_vars` dicts are updated in
            place.
        resource_attributes: Value to merge into `OTEL_RESOURCE_ATTRIBUTES`;
            when None nothing is changed.
    """
    if resource_attributes != None:
        env_var_fields = (
            "el_extra_env_vars",
            "cl_extra_env_vars",
            "vc_extra_env_vars",
        )
        for node in participants:
            for env_var_field in env_var_fields:
                append_otel_resource_attributes(
                    getattr(node, env_var_field),
                    resource_attributes,
                )
318+
80319

81320
def run(plan, args={}):
82321
"""Launches an arbitrarily complex ethereum testnet based on the arguments provided
@@ -90,8 +329,14 @@ def run(plan, args={}):
90329
num_participants = len(args_with_right_defaults.participants)
91330
network_params = args_with_right_defaults.network_params
92331

93-
# Detect the backend type early - needed for binary injection validation
94332
detected_backend = plan.get_cluster_type()
333+
otel_enabled = "otel" in args_with_right_defaults.additional_services
334+
if otel_enabled and detected_backend != "docker":
335+
fail(
336+
"The 'otel' additional_service requires the Docker backend because it uses the engine OTel stack published on the Docker host; detected backend: {}. Run with the Docker backend or remove 'otel' from additional_services.".format(
337+
detected_backend
338+
)
339+
)
95340

96341
if (
97342
"disruptoor" in args_with_right_defaults.additional_services
@@ -114,7 +359,6 @@ def run(plan, args={}):
114359
artifact = plan.render_templates(template_data, name + "_artifact")
115360
extra_files_artifacts[name] = artifact
116361

117-
# Validate binary injection - only supported with Docker backend
118362
for participant in args_with_right_defaults.participants:
119363
for bin_path in [
120364
participant.el_binary_path,
@@ -138,6 +382,22 @@ def run(plan, args={}):
138382
nginx_port = args_with_right_defaults.nginx_port
139383
docker_cache_params = args_with_right_defaults.docker_cache_params
140384

385+
engine_otel_endpoints = new_engine_otel_endpoints()
386+
if otel_enabled:
387+
engine_otel_endpoints = detect_engine_otel_endpoints(
388+
plan,
389+
global_tolerations,
390+
global_node_selectors,
391+
)
392+
add_otel_resource_attributes_to_participants(
393+
args_with_right_defaults.participants,
394+
engine_otel_endpoints.resource_attributes,
395+
)
396+
otel_clickhouse_host = engine_otel_endpoints.clickhouse_host
397+
otel_clickhouse_port = engine_otel_endpoints.clickhouse_port
398+
otel_otlp_grpc_url = engine_otel_endpoints.otlp_grpc_url
399+
otel_otlp_http_traces_url = engine_otel_endpoints.otlp_http_traces_url
400+
141401
for index, participant in enumerate(args_with_right_defaults.participants):
142402
if (
143403
num_participants == 1
@@ -284,6 +544,8 @@ def run(plan, args={}):
284544
parallel_keystore_generation,
285545
extra_files_artifacts,
286546
tempo_otlp_grpc_url,
547+
otel_otlp_grpc_url,
548+
otel_otlp_http_traces_url,
287549
detected_backend,
288550
)
289551

@@ -940,6 +1202,8 @@ def run(plan, args={}):
9401202
args_with_right_defaults.port_publisher,
9411203
index,
9421204
tempo_query_url,
1205+
otel_clickhouse_host,
1206+
otel_clickhouse_port,
9431207
)
9441208
plan.print("Successfully launched grafana")
9451209
elif additional_service == "tempo":
@@ -1097,6 +1361,10 @@ def run(plan, args={}):
10971361
args_with_right_defaults.docker_cache_params,
10981362
)
10991363
plan.print("Successfully launched trueblocks")
1364+
elif additional_service == "otel":
1365+
# Engine OTel reachability is enforced earlier via detect_engine_otel_endpoints();
1366+
# if discovery succeeded, the per-client OTLP env vars are already wired.
1367+
plan.print("OTel tracing wired to engine collector")
11001368
else:
11011369
fail("Invalid additional service %s" % (additional_service))
11021370
if launch_prometheus_grafana:
@@ -1128,6 +1396,8 @@ def run(plan, args={}):
11281396
args_with_right_defaults.port_publisher,
11291397
prometheus_grafana_index,
11301398
tempo_query_url,
1399+
otel_clickhouse_host,
1400+
otel_clickhouse_port,
11311401
)
11321402
plan.print("Successfully launched grafana")
11331403

@@ -1158,9 +1428,11 @@ def run(plan, args={}):
11581428

11591429
output = struct(
11601430
grafana_info=grafana_info,
1161-
blockscout_sc_verif_url=None
1162-
if ("blockscout" in args_with_right_defaults.additional_services) == False
1163-
else blockscout_sc_verif_url,
1431+
blockscout_sc_verif_url=(
1432+
None
1433+
if ("blockscout" in args_with_right_defaults.additional_services) == False
1434+
else blockscout_sc_verif_url
1435+
),
11641436
all_participants=all_participants,
11651437
pre_funded_accounts=prefunded_accounts,
11661438
network_params=network_params,

src/cl/caplin/caplin_launcher.star

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def launch(
4545
extra_files_artifacts,
4646
backend,
4747
tempo_otlp_grpc_url=None,
48+
otel_otlp_grpc_url=None,
4849
bootnode_enr_override=None,
4950
cl_binary_artifact=None,
5051
):
@@ -70,6 +71,7 @@ def launch(
7071
extra_files_artifacts,
7172
backend,
7273
tempo_otlp_grpc_url,
74+
otel_otlp_grpc_url,
7375
bootnode_enr_override,
7476
cl_binary_artifact,
7577
)
@@ -113,6 +115,7 @@ def get_beacon_config(
113115
extra_files_artifacts,
114116
backend,
115117
tempo_otlp_grpc_url,
118+
otel_otlp_grpc_url=None,
116119
bootnode_enr_override=None,
117120
cl_binary_artifact=None,
118121
):
@@ -240,7 +243,11 @@ def get_beacon_config(
240243
if cl_binary_artifact != None:
241244
files["/opt/bin"] = cl_binary_artifact.artifact
242245

243-
env_vars = participant.cl_extra_env_vars
246+
env_vars = shared_utils.with_otel_env_vars(
247+
participant.cl_extra_env_vars,
248+
otel_otlp_grpc_url,
249+
beacon_service_name,
250+
)
244251

245252
cmd_str = " ".join(cmd)
246253
if cl_binary_artifact != None:

0 commit comments

Comments
 (0)