diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java index e1cf9a45bf50..8a84ef180459 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java @@ -48,6 +48,8 @@ import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; import org.apache.hadoop.ozone.protocol.commands.RegisteredCommand; import org.apache.hadoop.ozone.protocol.commands.SCMCommand; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A node manager supports a simple interface for managing a datanode. @@ -74,6 +76,8 @@ public interface NodeManager extends StorageContainerNodeProtocol, EventHandler, NodeManagerMXBean, Closeable { + Logger LOG = LoggerFactory.getLogger(NodeManager.class); + /** * Register API without a layout version info object passed in. Useful for * tests. @@ -144,6 +148,57 @@ default int getAllNodeCount() { return getAllNodes().size(); } + /** + * @return DatanodeFinalizationCounts, finalized and total healthy node counts + */ + default DatanodeFinalizationCounts getDatanodeFinalizationCounts() { + int finalizedNodes = 0; + int totalHealthyNodes = 0; + + for (DatanodeDetails dn : getAllNodes()) { + try { + // Only count HEALTHY nodes. STALE/DEAD nodes are intentionally excluded + // for the following reasons: + // - When a node goes STALE, its write pipelines are closed, so it + // cannot be involved in writes regardless of finalization state. + // - The ZDU write path is designed to handle datanodes at different + // layout versions, so an unfinalized STALE node does not block + // correctness if it later returns to HEALTHY. + // - If it recovers to HEALTHY, it will receive a finalize command on + // its next heartbeat and finalize quickly. If it is in bad shape, + // it will likely go DEAD and can be ignored. + if (!getNodeStatus(dn).isHealthy()) { + continue; + } + totalHealthyNodes++; + DatanodeInfo datanodeInfo = getDatanodeInfo(dn); + if (datanodeInfo == null) { + LOG.warn("Could not get DatanodeInfo for {}, skipping in " + + "finalization wait.", dn.getHostName()); + continue; + } + + LayoutVersionProto dnLayout = datanodeInfo.getLastKnownLayoutVersion(); + int dnMlv = dnLayout.getMetadataLayoutVersion(); + int dnSlv = dnLayout.getSoftwareLayoutVersion(); + + if (dnMlv < dnSlv) { + // Datanode has not yet finalized + LOG.debug("Datanode {} has not yet finalized: MLV={}, SLV={}", + dn.getHostName(), dnMlv, dnSlv); + } else { + finalizedNodes++; + } + } catch (NodeNotFoundException e) { + // Node was removed while we were iterating. This is OK, skip it. + LOG.debug("Node {} not found while waiting for finalization, " + + "skipping.", dn); + } + } + + return new DatanodeFinalizationCounts(finalizedNodes, totalHealthyNodes); + } + /** * Returns the aggregated node stats. * @return the aggregated node stats. @@ -420,4 +475,30 @@ default void removeNode(DatanodeDetails datanodeDetails) throws NodeNotFoundExce } int openContainerLimit(List datanodes); + + /** + * Class to store the number finalized and healthy datanodes. + */ + final class DatanodeFinalizationCounts { + private final int numFinalizedDatanodes; + private final int totalHealthyDatanodes; + + public DatanodeFinalizationCounts(int numFinalizedDatanodes, + int totalHealthyDatanodes) { + this.numFinalizedDatanodes = numFinalizedDatanodes; + this.totalHealthyDatanodes = totalHealthyDatanodes; + } + + public int getNumFinalizedDatanodes() { + return numFinalizedDatanodes; + } + + public int getTotalHealthyDatanodes() { + return totalHealthyDatanodes; + } + + public boolean allNodesFinalized() { + return numFinalizedDatanodes == totalHealthyDatanodes; + } + } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index ca29b8c0c350..483d60c182af 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -92,6 +92,7 @@ import org.apache.hadoop.hdds.scm.ha.SCMRatisServerImpl; import org.apache.hadoop.hdds.scm.node.DatanodeInfo; import org.apache.hadoop.hdds.scm.node.DatanodeUsageInfo; +import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; @@ -1153,13 +1154,20 @@ public HddsProtos.UpgradeStatus queryUpgradeStatus() throws IOException { try { getScm().checkAdminAccess(getRemoteUser(), true); - // Returning a placeholder for now. + boolean scmFinalized = !scm.getLayoutVersionManager().needsFinalization(); + NodeManager.DatanodeFinalizationCounts datanodeFinalizationCounts = + scm.getScmNodeManager().getDatanodeFinalizationCounts(); + int finalizedDatanodes = datanodeFinalizationCounts.getNumFinalizedDatanodes(); + int healthyDatanodes = datanodeFinalizationCounts.getTotalHealthyDatanodes(); + boolean shouldFinalize = scmFinalized && datanodeFinalizationCounts.allNodesFinalized(); + HddsProtos.UpgradeStatus result = HddsProtos.UpgradeStatus.newBuilder() - .setScmFinalized(true) - .setNumDatanodesFinalized(10) - .setNumDatanodesTotal(10) - .setShouldFinalize(true) + .setScmFinalized(scmFinalized) + .setNumDatanodesFinalized(finalizedDatanodes) + .setNumDatanodesTotal(healthyDatanodes) + .setShouldFinalize(shouldFinalize) .build(); + AUDIT.logReadSuccess(buildAuditMessageForSuccess(SCMAction.QUERY_UPGRADE_STATUS, null)); return result; } catch (IOException ex) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/upgrade/SCMUpgradeFinalizer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/upgrade/SCMUpgradeFinalizer.java index 93f2c7dace88..7f11adb606b8 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/upgrade/SCMUpgradeFinalizer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/upgrade/SCMUpgradeFinalizer.java @@ -18,12 +18,8 @@ package org.apache.hadoop.hdds.scm.server.upgrade; import java.io.IOException; -import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.LayoutVersionProto; import org.apache.hadoop.hdds.scm.exceptions.SCMException; -import org.apache.hadoop.hdds.scm.node.DatanodeInfo; import org.apache.hadoop.hdds.scm.node.NodeManager; -import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; import org.apache.hadoop.hdds.upgrade.HDDSLayoutFeature; import org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager; import org.apache.hadoop.ozone.upgrade.BasicUpgradeFinalizer; @@ -121,46 +117,13 @@ private void waitForDatanodesToFinalize(SCMUpgradeFinalizationContext context) // SCM is no longer the leader by throwing NotLeaderException. context.getSCMContext().getTermOfLeader(); - allDatanodesFinalized = true; - int totalHealthyNodes = 0; - int finalizedNodes = 0; - int unfinalizedNodes = 0; - - for (DatanodeDetails dn : nodeManager.getAllNodes()) { - try { - // Only check HEALTHY nodes. STALE/DEAD nodes will be told to - // finalize when they recover. - if (nodeManager.getNodeStatus(dn).isHealthy()) { - totalHealthyNodes++; - DatanodeInfo datanodeInfo = nodeManager.getDatanodeInfo(dn); - if (datanodeInfo == null) { - LOG.warn("Could not get DatanodeInfo for {}, skipping in " + - "finalization wait.", dn.getHostName()); - continue; - } - - LayoutVersionProto dnLayout = datanodeInfo.getLastKnownLayoutVersion(); - int dnMlv = dnLayout.getMetadataLayoutVersion(); - int dnSlv = dnLayout.getSoftwareLayoutVersion(); - - if (dnMlv < dnSlv) { - // Datanode has not yet finalized - allDatanodesFinalized = false; - unfinalizedNodes++; - LOG.debug("Datanode {} has not yet finalized: MLV={}, SLV={}", - dn.getHostName(), dnMlv, dnSlv); - } else { - finalizedNodes++; - } - } - } catch (NodeNotFoundException e) { - // Node was removed while we were iterating. This is OK, skip it. - LOG.debug("Node {} not found while waiting for finalization, " + - "skipping.", dn); - } - } + NodeManager.DatanodeFinalizationCounts datanodeFinalizationCounts = nodeManager.getDatanodeFinalizationCounts(); + int finalizedNodes = datanodeFinalizationCounts.getNumFinalizedDatanodes(); + int totalHealthyNodes = datanodeFinalizationCounts.getTotalHealthyDatanodes(); + allDatanodesFinalized = datanodeFinalizationCounts.allNodesFinalized(); if (!allDatanodesFinalized) { + int unfinalizedNodes = totalHealthyNodes - finalizedNodes; LOG.info("Waiting for datanodes to finalize. Status: {}/{} healthy " + "datanodes have finalized ({} remaining).", finalizedNodes, totalHealthyNodes, unfinalizedNodes); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java index a93fbd4aa99b..51c024599a07 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java @@ -737,6 +737,146 @@ public void testProcessLayoutVersion() throws IOException { testProcessLayoutVersionReportHigherMlv(); } + @Test + public void testDatanodeFinalizedCounterTracksLayoutVersionReports() + throws IOException, AuthenticationException { + try (SCMNodeManager nodeManager = createNodeManager(getConf())) { + DatanodeDetails node = + HddsTestUtils.createRandomDatanodeAndRegister(nodeManager); + assertEquals(1, nodeManager.getDatanodeFinalizationCounts() + .getNumFinalizedDatanodes(), + "Initial datanode should be counted as finalized"); + + int softwareVersion = + nodeManager.getLayoutVersionManager().getSoftwareLayoutVersion(); + int metadataVersion = + nodeManager.getLayoutVersionManager().getMetadataLayoutVersion(); + nodeManager.processLayoutVersionReport(node, + LayoutVersionProto.newBuilder() + .setMetadataLayoutVersion(metadataVersion - 1) + .setSoftwareLayoutVersion(softwareVersion) + .build()); + assertEquals(0, nodeManager.getDatanodeFinalizationCounts() + .getNumFinalizedDatanodes(), + "Lower metadata layout version should decrement finalized count"); + + nodeManager.processLayoutVersionReport(node, + LayoutVersionProto.newBuilder() + .setMetadataLayoutVersion(metadataVersion) + .setSoftwareLayoutVersion(softwareVersion) + .build()); + assertEquals(1, nodeManager.getDatanodeFinalizationCounts() + .getNumFinalizedDatanodes(), + "Restored metadata layout version should restore finalized count"); + } + } + + @Test + public void testDatanodeFinalizedCounterTracksRegistrationAndRemoveNode() + throws IOException, AuthenticationException, NodeNotFoundException { + try (SCMNodeManager nodeManager = createNodeManager(getConf())) { + DatanodeDetails finalizedNode = + registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success); + assertEquals(1, nodeManager.getDatanodeFinalizationCounts() + .getNumFinalizedDatanodes(), + "Finalized registration should increment finalized count"); + + DatanodeDetails nonFinalizedNode = + registerWithCapacity(nodeManager, SMALLER_MLV_LAYOUT_PROTO, success); + assertEquals(1, nodeManager.getDatanodeFinalizationCounts() + .getNumFinalizedDatanodes(), + "Non-finalized registration should not increment finalized count"); + + nonFinalizedNode.setPersistedOpState( + HddsProtos.NodeOperationalState.DECOMMISSIONED); + nodeManager.removeNode(nonFinalizedNode); + assertEquals(1, nodeManager.getDatanodeFinalizationCounts() + .getNumFinalizedDatanodes(), + "Removing a non-finalized node should not change finalized count"); + + finalizedNode.setPersistedOpState( + HddsProtos.NodeOperationalState.DECOMMISSIONED); + nodeManager.removeNode(finalizedNode); + assertEquals(0, nodeManager.getDatanodeFinalizationCounts().getNumFinalizedDatanodes(), + "Removing a finalized node should decrement finalized count"); + } + } + + private static Stream ineligibleHealthStates() { + return Stream.of( + Arguments.of(NodeStatus.inServiceStale()), + Arguments.of(NodeStatus.inServiceDead()) + ); + } + + @ParameterizedTest + @MethodSource("ineligibleHealthStates") + public void testDatanodeFinalizedCounterExcludesNonHealthyNodes(NodeStatus expectedStatus) + throws IOException, AuthenticationException, NodeNotFoundException, InterruptedException { + OzoneConfiguration conf = getConf(); + conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL, 100, MILLISECONDS); + conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 1, SECONDS); + conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, SECONDS); + conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 6, SECONDS); + + try (SCMNodeManager nodeManager = createNodeManager(conf)) { + // transitionNode stops heartbeating and will become STALE or DEAD + DatanodeDetails transitionNode = + registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success); + // heartbeatingNode keeps heartbeating as a healthy baseline + DatanodeDetails heartbeatingNode = + registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success); + + nodeManager.processHeartbeat(transitionNode); + nodeManager.processHeartbeat(heartbeatingNode); + + assertEquals(2, nodeManager.getDatanodeFinalizationCounts().getTotalHealthyDatanodes(), + "Both nodes should start as healthy"); + + // Only heartbeat the baseline node until transitionNode reaches the expected state. + // STALE requires > 3s (wait 4s), DEAD requires > 6s (wait 7s total). + boolean waitForDead = expectedStatus.equals(NodeStatus.inServiceDead()); + Thread.sleep(2000); + nodeManager.processHeartbeat(heartbeatingNode); + Thread.sleep(2000); + if (waitForDead) { + nodeManager.processHeartbeat(heartbeatingNode); + Thread.sleep(3000); + } + + assertEquals(expectedStatus, nodeManager.getNodeStatus(transitionNode), + "Node should have transitioned to " + expectedStatus); + + assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getTotalHealthyDatanodes(), + expectedStatus + " node should be excluded from total count"); + assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getNumFinalizedDatanodes(), + expectedStatus + " node should be excluded from finalized count"); + } + } + + private static Stream allOperationalStates() { + return Stream.of(HddsProtos.NodeOperationalState.values()) + .map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("allOperationalStates") + public void testDatanodeFinalizedCounterIncludesAllHealthyOpStates( + HddsProtos.NodeOperationalState opState) + throws IOException, AuthenticationException, NodeNotFoundException { + try (SCMNodeManager nodeManager = createNodeManager(getConf())) { + DatanodeDetails node = + registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success); + nodeManager.setNodeOperationalState(node, opState); + + // All HEALTHY nodes should be counted regardless of operational state + assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getTotalHealthyDatanodes(), + "HEALTHY node with op state " + opState + " should be counted in total"); + assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getNumFinalizedDatanodes(), + "HEALTHY finalized node with op state " + opState + " should be counted as finalized"); + } + } + // Currently invoked by testProcessLayoutVersion. public void testProcessLayoutVersionReportHigherMlv() throws IOException { diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMClientProtocolServer.java index 9402218014ce..412dc635116a 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMClientProtocolServer.java @@ -155,6 +155,18 @@ private StorageContainerManager mockStorageContainerManager() { return storageContainerManager; } + @Test + public void testQueryUpgradeStatus() throws Exception { + HddsProtos.UpgradeStatus status = server.queryUpgradeStatus(); + + // SCM starts already finalized in tests + assertTrue(status.getScmFinalized()); + // No datanodes registered + assertEquals(0, status.getNumDatanodesFinalized()); + assertEquals(0, status.getNumDatanodesTotal()); + assertTrue(status.getShouldFinalize()); + } + private ContainerInfo newContainerInfoForTest() { return new ContainerInfo.Builder() .setContainerID(1) diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/upgrade/StatusSubCommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/upgrade/StatusSubCommand.java index 246887edde9c..b2654e5203d7 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/upgrade/StatusSubCommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/upgrade/StatusSubCommand.java @@ -39,8 +39,7 @@ public class StatusSubCommand extends ScmSubcommand { public void execute(ScmClient client) throws IOException { HddsProtos.UpgradeStatus status = client.queryUpgradeStatus(); - // Temporary output to validate the command is working. - out().println("Update status:"); + out().println("Upgrade status:"); out().println(" SCM Finalized: " + status.getScmFinalized()); out().println(" Datanodes finalized: " + status.getNumDatanodesFinalized()); out().println(" Total Datanodes: " + status.getNumDatanodesTotal()); diff --git a/hadoop-ozone/cli-admin/src/test/java/org/apache/hadoop/ozone/admin/upgrade/TestStatusSubCommand.java b/hadoop-ozone/cli-admin/src/test/java/org/apache/hadoop/ozone/admin/upgrade/TestStatusSubCommand.java new file mode 100644 index 000000000000..d0e0a439d1a0 --- /dev/null +++ b/hadoop-ozone/cli-admin/src/test/java/org/apache/hadoop/ozone/admin/upgrade/TestStatusSubCommand.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.admin.upgrade; + +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.client.ScmClient; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import picocli.CommandLine; + +/** + * Unit tests for {@link StatusSubCommand}. + */ +public class TestStatusSubCommand { + + private static final String DEFAULT_ENCODING = StandardCharsets.UTF_8.name(); + + private final ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + private final PrintStream originalOut = System.out; + private StatusSubCommand cmd; + + @BeforeEach + public void setup() throws UnsupportedEncodingException { + cmd = new StatusSubCommand(); + System.setOut(new PrintStream(outContent, false, DEFAULT_ENCODING)); + } + + @AfterEach + public void tearDown() { + System.setOut(originalOut); + } + + @Test + public void testStatusCommandPrintsUpgradeStatus() throws IOException { + ScmClient scmClient = mock(ScmClient.class); + HddsProtos.UpgradeStatus status = HddsProtos.UpgradeStatus.newBuilder() + .setScmFinalized(false) + .setNumDatanodesFinalized(1) + .setNumDatanodesTotal(3) + .setShouldFinalize(true) + .build(); + when(scmClient.queryUpgradeStatus()).thenReturn(status); + + new CommandLine(cmd).parseArgs(); + cmd.execute(scmClient); + + String output = outContent.toString(DEFAULT_ENCODING); + assertTrue(output.contains("Upgrade status:")); + assertTrue(output.contains("SCM Finalized: false")); + assertTrue(output.contains("Datanodes finalized: 1")); + assertTrue(output.contains("Total Datanodes: 3")); + assertTrue(output.contains("Should Finalize: true")); + verify(scmClient).queryUpgradeStatus(); + } +}