Skip to content

Commit 7e4e5f3

Browse files
HDDS-14039. Create Grafana dashboard for Ozone SCM safemode rules and exit (#9400)
1 parent d71ab1f commit 7e4e5f3

6 files changed

Lines changed: 840 additions & 2 deletions

File tree

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ public DataNodeSafeModeRule(EventQueue eventQueue,
5151
requiredDns = conf.getInt(
5252
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
5353
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT);
54+
getSafeModeMetrics().setNumRequiredDatanodesThreshold(requiredDns);
5455
registeredDnSet = new HashSet<>(requiredDns * 2);
5556
this.nodeManager = nodeManager;
5657
}
@@ -71,9 +72,14 @@ protected boolean validate() {
7172
@Override
7273
protected void process(NodeRegistrationContainerReport reportsProto) {
7374

74-
registeredDnSet.add(reportsProto.getDatanodeDetails().getID());
75+
DatanodeID dnId = reportsProto.getDatanodeDetails().getID();
76+
boolean added = registeredDnSet.add(dnId);
7577
registeredDns = registeredDnSet.size();
7678

79+
if (added) {
80+
getSafeModeMetrics().incCurrentRegisteredDatanodesCount();
81+
}
82+
7783
if (scmInSafeMode()) {
7884
SCMSafeModeManager.getLogger().info(
7985
"SCM in safe mode. {} DataNodes registered, {} required.",

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,7 @@ public SafeModeMetrics getSafeModeMetrics() {
119119

120120
private void emitSafeModeStatus() {
121121
final SafeModeStatus safeModeStatus = status.get();
122+
safeModeMetrics.setScmInSafeMode(safeModeStatus.isInSafeMode());
122123
scmContext.updateSafeModeStatus(safeModeStatus);
123124

124125
// notify SCMServiceManager

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
import org.apache.hadoop.metrics2.annotation.Metric;
2323
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
2424
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
25+
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
2526
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
2627

2728
/**
@@ -52,6 +53,12 @@ public class SafeModeMetrics {
5253
private @Metric MutableCounterLong
5354
currentPipelinesWithAtleastOneReplicaReportedCount;
5455

56+
@Metric("Metric will be set to 1 if SCM is in SafeMode, otherwise 0")
57+
private MutableGaugeInt scmInSafeMode;
58+
59+
@Metric private MutableGaugeLong numRequiredDatanodesThreshold;
60+
@Metric private MutableCounterLong currentRegisteredDatanodesCount;
61+
5562
public static SafeModeMetrics create() {
5663
final MetricsSystem ms = DefaultMetricsSystem.instance();
5764
return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
@@ -86,6 +93,14 @@ public void setNumContainerReportedThreshold(HddsProtos.ReplicationType type, lo
8693
}
8794
}
8895

96+
public void setScmInSafeMode(boolean inSafeMode) {
97+
this.scmInSafeMode.set(inSafeMode ? 1 : 0);
98+
}
99+
100+
public void setNumRequiredDatanodesThreshold(long val) {
101+
this.numRequiredDatanodesThreshold.set(val);
102+
}
103+
89104
public void incCurrentContainersWithOneReplicaReportedCount() {
90105
this.currentContainersWithOneReplicaReportedCount.incr();
91106
}
@@ -94,6 +109,10 @@ public void incCurrentContainersWithECDataReplicaReportedCount() {
94109
this.currentContainersWithECDataReplicaReportedCount.incr();
95110
}
96111

112+
public void incCurrentRegisteredDatanodesCount() {
113+
this.currentRegisteredDatanodesCount.incr();
114+
}
115+
97116
MutableGaugeLong getNumHealthyPipelinesThreshold() {
98117
return numHealthyPipelinesThreshold;
99118
}
@@ -122,6 +141,14 @@ MutableGaugeLong getNumContainerWithECDataReplicaReportedThreshold() {
122141
MutableCounterLong getCurrentContainersWithOneReplicaReportedCount() {
123142
return currentContainersWithOneReplicaReportedCount;
124143
}
144+
145+
MutableCounterLong getCurrentRegisteredDatanodesCount() {
146+
return currentRegisteredDatanodesCount;
147+
}
148+
149+
MutableGaugeInt getScmInSafeMode() {
150+
return scmInSafeMode;
151+
}
125152

126153
public void unRegister() {
127154
MetricsSystem ms = DefaultMetricsSystem.instance();

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.hadoop.hdds.scm.safemode;
1919

20+
import static org.junit.jupiter.api.Assertions.assertEquals;
2021
import static org.junit.jupiter.api.Assertions.assertFalse;
2122
import static org.junit.jupiter.api.Assertions.assertNotNull;
2223
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -53,6 +54,7 @@ public class TestDataNodeSafeModeRule {
5354
private EventQueue eventQueue;
5455
private NodeManager nodeManager;
5556
private SCMSafeModeManager mockSafeModeManager;
57+
private SafeModeMetrics metrics;
5658

5759
private void setup(int requiredDns) throws Exception {
5860
OzoneConfiguration ozoneConfiguration = new OzoneConfiguration();
@@ -65,6 +67,8 @@ private void setup(int requiredDns) throws Exception {
6567
eventQueue = new EventQueue();
6668

6769
mockSafeModeManager = mock(SCMSafeModeManager.class);
70+
metrics = SafeModeMetrics.create();
71+
when(mockSafeModeManager.getSafeModeMetrics()).thenReturn(metrics);
6872

6973
rule = new DataNodeSafeModeRule(eventQueue, ozoneConfiguration, nodeManager, mockSafeModeManager);
7074
assertNotNull(rule);
@@ -94,6 +98,7 @@ public void testDataNodeSafeModeRuleWithNoNodes() throws Exception {
9498
"SCM in safe mode. 1 DataNodes registered, 1 required."), 1000, 5000);
9599

96100
assertTrue(rule.validate());
101+
assertEquals(1, metrics.getCurrentRegisteredDatanodesCount().value());
97102
}
98103

99104
@Test
@@ -120,7 +125,7 @@ public void testDataNodeSafeModeRuleWithMultipleNodes() throws Exception {
120125
"SCM in safe mode. 2 DataNodes registered, 3 required."), 1000, 5000);
121126

122127
assertFalse(rule.validate());
123-
128+
assertEquals(2, metrics.getCurrentRegisteredDatanodesCount().value());
124129
DatanodeDetails dd = MockDatanodeDetails.randomDatanodeDetails();
125130
NodeRegistrationContainerReport nodeReg =
126131
new NodeRegistrationContainerReport(dd, null);
@@ -131,6 +136,7 @@ public void testDataNodeSafeModeRuleWithMultipleNodes() throws Exception {
131136
"SCM in safe mode. 3 DataNodes registered, 3 required."), 1000, 5000);
132137

133138
assertTrue(rule.validate());
139+
assertEquals(3, metrics.getCurrentRegisteredDatanodesCount().value());
134140
}
135141

136142
@Test

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,9 @@ public void setUp() throws IOException {
109109

110110
@AfterEach
111111
public void destroyDbStore() throws Exception {
112+
if (scmSafeModeManager != null) {
113+
scmSafeModeManager.getSafeModeMetrics().unRegister();
114+
}
112115
if (scmMetadataStore.getStore() != null) {
113116
scmMetadataStore.getStore().close();
114117
}
@@ -136,6 +139,7 @@ private void testSafeMode(int numContainers) throws Exception {
136139
scmSafeModeManager.start();
137140

138141
assertTrue(scmSafeModeManager.getInSafeMode());
142+
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
139143
validateRuleStatus("DatanodeSafeModeRule", "registered datanodes 0");
140144
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
141145
HddsTestUtils.createNodeRegistrationContainerReport(containers);
@@ -151,6 +155,9 @@ private void testSafeMode(int numContainers) throws Exception {
151155

152156
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
153157
100, 1000 * 5);
158+
GenericTestUtils.waitFor(() ->
159+
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
160+
100, 1000 * 5);
154161

155162
assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
156163
.getCurrentContainersWithOneReplicaReportedCount().value());
@@ -182,6 +189,7 @@ public void testSafeModeExitRule() throws Exception {
182189
.getNumContainerWithOneReplicaReportedThreshold().value());
183190

184191
assertTrue(scmSafeModeManager.getInSafeMode());
192+
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
185193
validateRuleStatus("ContainerSafeModeRule",
186194
"0.00% of [Ratis] Containers(0 / 100) with at least one reported");
187195
testContainerThreshold(containers.subList(0, 25), 0.25);
@@ -202,6 +210,9 @@ public void testSafeModeExitRule() throws Exception {
202210

203211
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
204212
100, 1000 * 5);
213+
GenericTestUtils.waitFor(() ->
214+
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
215+
100, 1000 * 5);
205216
}
206217

207218
private OzoneConfiguration createConf(double healthyPercent,
@@ -306,6 +317,7 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(
306317
scmSafeModeManager.start();
307318

308319
assertTrue(scmSafeModeManager.getInSafeMode());
320+
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
309321
if (healthyPipelinePercent > 0) {
310322
validateRuleStatus("HealthyPipelineSafeModeRule",
311323
"healthy Ratis/THREE pipelines");
@@ -367,6 +379,9 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(
367379

368380
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
369381
100, 1000 * 5);
382+
GenericTestUtils.waitFor(() ->
383+
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
384+
100, 1000 * 5);
370385
}
371386

372387
/**
@@ -477,8 +492,10 @@ public void testContainerSafeModeRule() throws Exception {
477492

478493
scmSafeModeManager = new SCMSafeModeManager(config, null, null,
479494
containerManager, serviceManager, queue, scmContext);
495+
scmSafeModeManager.start();
480496

481497
assertTrue(scmSafeModeManager.getInSafeMode());
498+
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
482499

483500
// When 10 CLOSED containers are reported by DNs, the computed container
484501
// threshold should be 10/20 as there are only 20 CLOSED NON-EMPTY
@@ -494,6 +511,9 @@ public void testContainerSafeModeRule() throws Exception {
494511

495512
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
496513
100, 1000 * 5);
514+
GenericTestUtils.waitFor(() ->
515+
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
516+
100, 1000 * 5);
497517
}
498518

499519
// We simulate common EC types: EC-2-2-1024K, EC-3-2-1024K, EC-6-3-1024K.
@@ -584,6 +604,7 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {
584604

585605
// Assert SCM is in Safe mode.
586606
assertTrue(scmSafeModeManager.getInSafeMode());
607+
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
587608

588609
// Register all DataNodes except last one and assert SCM is in safe mode.
589610
for (int i = 0; i < numOfDns - 1; i++) {
@@ -606,6 +627,9 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {
606627
HddsTestUtils.createNodeRegistrationContainerReport(containers));
607628
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
608629
10, 1000 * 10);
630+
GenericTestUtils.waitFor(() ->
631+
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
632+
100, 1000 * 5);
609633
}
610634

611635
private void testContainerThreshold(List<ContainerInfo> dnContainers,
@@ -700,11 +724,15 @@ public void testSafeModePipelineExitRule() throws Exception {
700724

701725

702726
assertTrue(scmSafeModeManager.getInSafeMode());
727+
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
703728

704729
firePipelineEvent(pipelineManager, pipeline);
705730

706731
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
707732
100, 1000 * 10);
733+
GenericTestUtils.waitFor(() ->
734+
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
735+
100, 1000 * 5);
708736
pipelineManager.close();
709737
}
710738

@@ -744,6 +772,7 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {
744772

745773
// Assert SCM is in Safe mode.
746774
assertTrue(scmSafeModeManager.getInSafeMode());
775+
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
747776

748777
// stop background pipeline creator as we manually create
749778
// pipeline below
@@ -781,5 +810,8 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {
781810
queue.processAll(5000);
782811
assertTrue(scmSafeModeManager.getPreCheckComplete());
783812
assertFalse(scmSafeModeManager.getInSafeMode());
813+
GenericTestUtils.waitFor(() ->
814+
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
815+
100, 1000 * 5);
784816
}
785817
}

0 commit comments

Comments
 (0)