Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ public DataNodeSafeModeRule(EventQueue eventQueue,
requiredDns = conf.getInt(
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT);
getSafeModeMetrics().setNumRequiredDatanodesThreshold(requiredDns);
registeredDnSet = new HashSet<>(requiredDns * 2);
this.nodeManager = nodeManager;
}
Expand All @@ -71,9 +72,14 @@ protected boolean validate() {
@Override
protected void process(NodeRegistrationContainerReport reportsProto) {

registeredDnSet.add(reportsProto.getDatanodeDetails().getID());
DatanodeID dnId = reportsProto.getDatanodeDetails().getID();
boolean added = registeredDnSet.add(dnId);
registeredDns = registeredDnSet.size();

if (added) {
getSafeModeMetrics().incCurrentRegisteredDatanodesCount();
Comment thread
sreejasahithi marked this conversation as resolved.
}

if (scmInSafeMode()) {
SCMSafeModeManager.getLogger().info(
"SCM in safe mode. {} DataNodes registered, {} required.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ public SafeModeMetrics getSafeModeMetrics() {

private void emitSafeModeStatus() {
final SafeModeStatus safeModeStatus = status.get();
safeModeMetrics.setScmInSafeMode(safeModeStatus.isInSafeMode());
Comment thread
Tejaskriya marked this conversation as resolved.
scmContext.updateSafeModeStatus(safeModeStatus);

// notify SCMServiceManager
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;

/**
Expand Down Expand Up @@ -52,6 +53,12 @@ public class SafeModeMetrics {
private @Metric MutableCounterLong
currentPipelinesWithAtleastOneReplicaReportedCount;

// Gauge mirroring the SCM safe mode flag; written through setScmInSafeMode(boolean).
@Metric("Metric will be set to 1 if SCM is in SafeMode, otherwise 0")
private MutableGaugeInt scmInSafeMode;

// Gauge holding the value passed to setNumRequiredDatanodesThreshold(long).
@Metric private MutableGaugeLong numRequiredDatanodesThreshold;
// Counter advanced by incCurrentRegisteredDatanodesCount().
@Metric private MutableCounterLong currentRegisteredDatanodesCount;
Comment thread
errose28 marked this conversation as resolved.

public static SafeModeMetrics create() {
final MetricsSystem ms = DefaultMetricsSystem.instance();
return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
Expand Down Expand Up @@ -86,6 +93,14 @@ public void setNumContainerReportedThreshold(HddsProtos.ReplicationType type, lo
}
}

/**
 * Publishes the SCM safe mode state through the {@code scmInSafeMode} gauge.
 *
 * @param inSafeMode safe mode flag; the gauge is set to 1 when {@code true}
 *                   and to 0 when {@code false}
 */
public void setScmInSafeMode(boolean inSafeMode) {
  final int gaugeValue;
  if (inSafeMode) {
    gaugeValue = 1;
  } else {
    gaugeValue = 0;
  }
  scmInSafeMode.set(gaugeValue);
}

/**
 * Stores the given value in the {@code numRequiredDatanodesThreshold} gauge.
 *
 * @param val value to publish through the gauge
 */
public void setNumRequiredDatanodesThreshold(long val) {
  numRequiredDatanodesThreshold.set(val);
}

/**
 * Increments the {@code currentContainersWithOneReplicaReportedCount} counter.
 */
public void incCurrentContainersWithOneReplicaReportedCount() {
  currentContainersWithOneReplicaReportedCount.incr();
}
Expand All @@ -94,6 +109,10 @@ public void incCurrentContainersWithECDataReplicaReportedCount() {
this.currentContainersWithECDataReplicaReportedCount.incr();
}

/**
 * Increments the {@code currentRegisteredDatanodesCount} counter.
 */
public void incCurrentRegisteredDatanodesCount() {
  currentRegisteredDatanodesCount.incr();
}

/**
 * Package-private accessor for the {@code numHealthyPipelinesThreshold} gauge.
 *
 * @return the gauge object itself, not a snapshot of its value
 */
MutableGaugeLong getNumHealthyPipelinesThreshold() {
  return this.numHealthyPipelinesThreshold;
}
Expand Down Expand Up @@ -122,6 +141,14 @@ MutableGaugeLong getNumContainerWithECDataReplicaReportedThreshold() {
/**
 * Package-private accessor for the
 * {@code currentContainersWithOneReplicaReportedCount} counter.
 *
 * @return the counter object itself, not a snapshot of its value
 */
MutableCounterLong getCurrentContainersWithOneReplicaReportedCount() {
  return this.currentContainersWithOneReplicaReportedCount;
}

/**
 * Package-private accessor for the {@code currentRegisteredDatanodesCount}
 * counter.
 *
 * @return the counter object itself, not a snapshot of its value
 */
MutableCounterLong getCurrentRegisteredDatanodesCount() {
  return this.currentRegisteredDatanodesCount;
}

/**
 * Package-private accessor for the {@code scmInSafeMode} gauge.
 *
 * @return the gauge object itself, not a snapshot of its value
 */
MutableGaugeInt getScmInSafeMode() {
  return this.scmInSafeMode;
}

public void unRegister() {
MetricsSystem ms = DefaultMetricsSystem.instance();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.hadoop.hdds.scm.safemode;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
Expand Down Expand Up @@ -53,6 +54,7 @@ public class TestDataNodeSafeModeRule {
private EventQueue eventQueue;
private NodeManager nodeManager;
private SCMSafeModeManager mockSafeModeManager;
private SafeModeMetrics metrics;

private void setup(int requiredDns) throws Exception {
OzoneConfiguration ozoneConfiguration = new OzoneConfiguration();
Expand All @@ -65,6 +67,8 @@ private void setup(int requiredDns) throws Exception {
eventQueue = new EventQueue();

mockSafeModeManager = mock(SCMSafeModeManager.class);
metrics = SafeModeMetrics.create();
when(mockSafeModeManager.getSafeModeMetrics()).thenReturn(metrics);

rule = new DataNodeSafeModeRule(eventQueue, ozoneConfiguration, nodeManager, mockSafeModeManager);
assertNotNull(rule);
Expand Down Expand Up @@ -94,6 +98,7 @@ public void testDataNodeSafeModeRuleWithNoNodes() throws Exception {
"SCM in safe mode. 1 DataNodes registered, 1 required."), 1000, 5000);

assertTrue(rule.validate());
assertEquals(1, metrics.getCurrentRegisteredDatanodesCount().value());
}

@Test
Expand All @@ -120,7 +125,7 @@ public void testDataNodeSafeModeRuleWithMultipleNodes() throws Exception {
"SCM in safe mode. 2 DataNodes registered, 3 required."), 1000, 5000);

assertFalse(rule.validate());

assertEquals(2, metrics.getCurrentRegisteredDatanodesCount().value());
DatanodeDetails dd = MockDatanodeDetails.randomDatanodeDetails();
NodeRegistrationContainerReport nodeReg =
new NodeRegistrationContainerReport(dd, null);
Expand All @@ -131,6 +136,7 @@ public void testDataNodeSafeModeRuleWithMultipleNodes() throws Exception {
"SCM in safe mode. 3 DataNodes registered, 3 required."), 1000, 5000);

assertTrue(rule.validate());
assertEquals(3, metrics.getCurrentRegisteredDatanodesCount().value());
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ public void setUp() throws IOException {

@AfterEach
public void destroyDbStore() throws Exception {
if (scmSafeModeManager != null) {
scmSafeModeManager.getSafeModeMetrics().unRegister();
}
if (scmMetadataStore.getStore() != null) {
scmMetadataStore.getStore().close();
}
Expand Down Expand Up @@ -136,6 +139,7 @@ private void testSafeMode(int numContainers) throws Exception {
scmSafeModeManager.start();

assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
validateRuleStatus("DatanodeSafeModeRule", "registered datanodes 0");
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
HddsTestUtils.createNodeRegistrationContainerReport(containers);
Expand All @@ -151,6 +155,9 @@ private void testSafeMode(int numContainers) throws Exception {

GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);

assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
.getCurrentContainersWithOneReplicaReportedCount().value());
Expand Down Expand Up @@ -182,6 +189,7 @@ public void testSafeModeExitRule() throws Exception {
.getNumContainerWithOneReplicaReportedThreshold().value());

assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
validateRuleStatus("ContainerSafeModeRule",
"0.00% of [Ratis] Containers(0 / 100) with at least one reported");
testContainerThreshold(containers.subList(0, 25), 0.25);
Expand All @@ -202,6 +210,9 @@ public void testSafeModeExitRule() throws Exception {

GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);
}

private OzoneConfiguration createConf(double healthyPercent,
Expand Down Expand Up @@ -306,6 +317,7 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(
scmSafeModeManager.start();

assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
if (healthyPipelinePercent > 0) {
validateRuleStatus("HealthyPipelineSafeModeRule",
"healthy Ratis/THREE pipelines");
Expand Down Expand Up @@ -367,6 +379,9 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(

GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);
}

/**
Expand Down Expand Up @@ -477,8 +492,10 @@ public void testContainerSafeModeRule() throws Exception {

scmSafeModeManager = new SCMSafeModeManager(config, null, null,
containerManager, serviceManager, queue, scmContext);
scmSafeModeManager.start();

assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());

// When 10 CLOSED containers are reported by DNs, the computed container
// threshold should be 10/20 as there are only 20 CLOSED NON-EMPTY
Expand All @@ -494,6 +511,9 @@ public void testContainerSafeModeRule() throws Exception {

GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);
}

// We simulate common EC types: EC-2-2-1024K, EC-3-2-1024K, EC-6-3-1024K.
Expand Down Expand Up @@ -584,6 +604,7 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {

// Assert SCM is in Safe mode.
assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());

// Register all DataNodes except last one and assert SCM is in safe mode.
for (int i = 0; i < numOfDns - 1; i++) {
Expand All @@ -606,6 +627,9 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {
HddsTestUtils.createNodeRegistrationContainerReport(containers));
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
10, 1000 * 10);
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);
}

private void testContainerThreshold(List<ContainerInfo> dnContainers,
Expand Down Expand Up @@ -700,11 +724,15 @@ public void testSafeModePipelineExitRule() throws Exception {


assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());

firePipelineEvent(pipelineManager, pipeline);

GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 10);
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);
pipelineManager.close();
}

Expand Down Expand Up @@ -744,6 +772,7 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {

// Assert SCM is in Safe mode.
assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());

// stop background pipeline creator as we manually create
// pipeline below
Expand Down Expand Up @@ -781,5 +810,8 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {
queue.processAll(5000);
assertTrue(scmSafeModeManager.getPreCheckComplete());
assertFalse(scmSafeModeManager.getInSafeMode());
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);
}
}
Loading