Skip to content

Commit 4b67db7

Browse files
committed
HDDS-15552. Ratis events should not be published as metrics
Change-Id: I06634562bad09e1cf308b3d2f9ec93d4b6c078fe
1 parent cb29f19 commit 4b67db7

12 files changed

Lines changed: 134 additions & 6 deletions

File tree

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/placement/metrics/SCMMetrics.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,9 @@ public void addRatisEvent(String event) {
180180
}
181181
}
182182

183-
@Metric("Ratis state machine events")
183+
// Ratis state machine events are multi-line logs, which should not be
184+
// published as time-series metrics to metrics systems like Prometheus.
185+
// Instead, they are exposed via JMX / MXBean endpoints.
184186
public String getRatisEvents() {
185187
synchronized (ratisEvents) {
186188
return String.join("\n", ratisEvents);

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMMXBean.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,4 +82,6 @@ public interface SCMMXBean extends ServiceRuntimeInfo {
8282
* @return the SCM hostname for the datanode.
8383
*/
8484
String getHostname();
85+
86+
/**
 * Returns the recorded Ratis state machine events for exposure over JMX.
 *
 * @return the Ratis events as a single string; the SCM implementation
 *         delegates to SCMMetrics, which joins the events with newlines,
 *         and yields an empty string when no metrics object exists.
 */
String getRatisEvents();
8587
}

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2238,6 +2238,11 @@ public String getHostname() {
22382238
return scmHostName;
22392239
}
22402240

2241+
@Override
2242+
public String getRatisEvents() {
2243+
return metrics != null ? metrics.getRatisEvents() : "";
2244+
}
2245+
22412246
public Collection<String> getScmAdminUsernames() {
22422247
return scmAdmins.getAdminUsernames();
22432248
}

hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@
3030
templateUrl: 'ratis-events.html',
3131
controller: function ($http) {
3232
var ctrl = this;
33-
$http.get("jmx?qry=Hadoop:service=StorageContainerManager,name=SCMMetrics")
33+
$http.get("jmx?qry=Hadoop:service=StorageContainerManager,name=StorageContainerManagerInfo,component=ServerRuntime")
3434
.then(function (result) {
3535
var metrics = result.data.beans[0];
36-
var rawEvents = metrics['tag.RatisEvents'] ? metrics['tag.RatisEvents'].split('\n') : [];
36+
var rawEvents = (metrics && metrics['RatisEvents']) ? metrics['RatisEvents'].split('\n') : [];
3737
ctrl.events = rawEvents.map(function(e) {
3838
var parts = e.split('|');
3939
return {

hadoop-ozone/dist/src/main/compose/ozone/test.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export COMPOSE_DIR
2424

2525
export SECURITY_ENABLED=false
2626
export OZONE_REPLICATION_FACTOR=3
27+
export COMPOSE_FILE=docker-compose.yaml:monitoring.yaml
2728

2829
# shellcheck source=/dev/null
2930
source "$COMPOSE_DIR/../testlib.sh"
@@ -40,6 +41,7 @@ execute_robot_test scm gdpr
4041
execute_robot_test scm security/ozone-secure-token.robot
4142

4243
execute_robot_test scm recon
44+
execute_robot_test scm prometheus
4345

4446
execute_robot_test scm om-ratis
4547

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one or more
2+
# contributor license agreements. See the NOTICE file distributed with
3+
# this work for additional information regarding copyright ownership.
4+
# The ASF licenses this file to You under the Apache License, Version 2.0
5+
# (the "License"); you may not use this file except in compliance with
6+
# the License. You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
*** Settings ***
Documentation       Test Prometheus monitoring integration
Library             OperatingSystem
Library             BuiltIn
Resource            ../commonlib.robot

*** Test Cases ***
Verify Prometheus targets are healthy
    # Retry the health check every 10 seconds for up to 90 seconds.
    Wait Until Keyword Succeeds    90sec    10sec    Check Prometheus Targets Health

*** Keywords ***
Check Prometheus Targets Health
    # prometheus_check.py prints "Successfully verified ..." only when every
    # reachable scrape target is reported as "up" by Prometheus.
    # NOTE(review): Execute is presumably provided by commonlib.robot - confirm.
    ${result} =    Execute    python3 ${OZONE_DIR}/smoketest/prometheus/prometheus_check.py
    Should Contain    ${result}    Successfully verified
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env python3
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import urllib.request
18+
import json
19+
import sys
20+
import socket
21+
22+
def is_running(host, port, timeout=2):
    """Return True if a TCP connection to ``host:port`` can be opened.

    Args:
        host: Hostname or IP address to probe.
        port: Port number; anything ``int()`` accepts (e.g. ``"9876"``).
        timeout: Seconds to wait for the connection before giving up.

    Returns:
        True when the connection succeeds, False on any failure
        (refused connection, timeout, unresolvable host, invalid port).
    """
    try:
        # The connection is only a liveness probe; the context manager
        # closes it as soon as it has been established.
        with socket.create_connection((host, int(port)), timeout=timeout):
            return True
    except Exception:
        # Deliberately best-effort: every failure means "not reachable".
        return False
28+
29+
def main():
    """Verify that every reachable Prometheus scrape target reports ``up``.

    Queries the Prometheus targets API, probes each active target's
    host and port directly with ``is_running``, and compares the result
    with the health Prometheus reports for that target. Targets that are
    not reachable are skipped rather than treated as failures.

    Exits with status 1 when there are no active targets, when no target
    could be checked, when a reachable target is not ``up``, or when any
    exception occurs. On success prints a "Successfully verified" line.
    """
    try:
        # The timeout keeps the check from blocking forever if Prometheus
        # accepts the connection but never answers; the context manager
        # closes the HTTP response once it has been read.
        with urllib.request.urlopen(
                "http://prometheus:9090/api/v1/targets", timeout=10) as res:
            data = json.loads(res.read().decode())
        targets = data.get("data", {}).get("activeTargets", [])
        if not targets:
            print("No active targets found in Prometheus")
            sys.exit(1)

        failed = False
        checked = 0
        for t in targets:
            url = t.get("scrapeUrl", "")
            # scrapeUrl is like "http://scm:9876/prom"
            try:
                host_port = url.split("//")[1].split("/")[0]
                if ":" in host_port:
                    host, port = host_port.split(":")
                else:
                    host = host_port
                    port = 80
            except Exception:
                # Report the skipped target instead of dropping it silently.
                print(f"Could not parse scrapeUrl '{url}'. Skipping target.")
                continue

            if is_running(host, port):
                checked += 1
                health = t.get("health", "")
                print(f"Target {host}:{port} is running. Prometheus health: {health}")
                if health != "up":
                    print(f"Error: Target {host}:{port} is running but Prometheus health is '{health}'. Last error: {t.get('lastError')}")
                    failed = True
            else:
                print(f"Target {host}:{port} is not running. Skipping check.")

        if checked == 0:
            print("Error: No running targets were checked!")
            sys.exit(1)

        if failed:
            sys.exit(1)

        print(f"Successfully verified {checked} running targets.")
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the explicit exits above are not intercepted here.
        print(f"Exception during health check: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestSCMMXBean.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ public void testSCMMXBean() throws Exception {
8686
double containerThreshold = (double) mbs.getAttribute(bean,
8787
"SafeModeCurrentContainerThreshold");
8888
assertEquals(scm.getCurrentContainerThreshold(), containerThreshold, 0);
89+
90+
String ratisEvents = (String) mbs.getAttribute(bean, "RatisEvents");
91+
assertEquals(scm.getMetrics().getRatisEvents(), ratisEvents);
8992
}
9093

9194
@Test

hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMMXBean.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,6 @@ public interface OMMXBean extends ServiceRuntimeInfo {
4141
* @return the OM hostname for the datanode.
4242
*/
4343
String getHostname();
44+
45+
/**
 * Returns the recorded Ratis state machine events for exposure over JMX.
 *
 * @return the Ratis events as a single string; presumably the
 *         newline-joined value built by OMMetrics#getRatisEvents
 *         (the OM-side implementation is not visible here - confirm).
 */
String getRatisEvents();
4446
}

hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMMetrics.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1590,7 +1590,9 @@ public void addRatisEvent(String event) {
15901590
}
15911591
}
15921592

1593-
@Metric("Ratis state machine events")
1593+
// Ratis state machine events are multi-line logs, which should not be
1594+
// published as time-series metrics to metrics systems like Prometheus.
1595+
// Instead, they are exposed via JMX / MXBean endpoints.
15941596
public String getRatisEvents() {
15951597
synchronized (ratisEvents) {
15961598
return String.join("\n", ratisEvents);

0 commit comments

Comments
 (0)