Skip to content

Commit db2219f

Browse files
authored
HDDS-15034. Query SCM status for ozone admin upgrade status command (#10084)
1 parent fca46ec commit db2219f

7 files changed

Lines changed: 333 additions & 49 deletions

File tree

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
4949
import org.apache.hadoop.ozone.protocol.commands.RegisteredCommand;
5050
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
51+
import org.slf4j.Logger;
52+
import org.slf4j.LoggerFactory;
5153

5254
/**
5355
* A node manager supports a simple interface for managing a datanode.
@@ -74,6 +76,8 @@
7476
public interface NodeManager extends StorageContainerNodeProtocol,
7577
EventHandler<CommandForDatanode>, NodeManagerMXBean, Closeable {
7678

79+
Logger LOG = LoggerFactory.getLogger(NodeManager.class);
80+
7781
/**
7882
* Register API without a layout version info object passed in. Useful for
7983
* tests.
@@ -144,6 +148,57 @@ default int getAllNodeCount() {
144148
return getAllNodes().size();
145149
}
146150

151+
/**
152+
* @return DatanodeFinalizationCounts, finalized and total healthy node counts
153+
*/
154+
default DatanodeFinalizationCounts getDatanodeFinalizationCounts() {
155+
int finalizedNodes = 0;
156+
int totalHealthyNodes = 0;
157+
158+
for (DatanodeDetails dn : getAllNodes()) {
159+
try {
160+
// Only count HEALTHY nodes. STALE/DEAD nodes are intentionally excluded
161+
// for the following reasons:
162+
// - When a node goes STALE, its write pipelines are closed, so it
163+
// cannot be involved in writes regardless of finalization state.
164+
// - The ZDU write path is designed to handle datanodes at different
165+
// layout versions, so an unfinalized STALE node does not block
166+
// correctness if it later returns to HEALTHY.
167+
// - If it recovers to HEALTHY, it will receive a finalize command on
168+
// its next heartbeat and finalize quickly. If it is in bad shape,
169+
// it will likely go DEAD and can be ignored.
170+
if (!getNodeStatus(dn).isHealthy()) {
171+
continue;
172+
}
173+
totalHealthyNodes++;
174+
DatanodeInfo datanodeInfo = getDatanodeInfo(dn);
175+
if (datanodeInfo == null) {
176+
LOG.warn("Could not get DatanodeInfo for {}, skipping in " +
177+
"finalization wait.", dn.getHostName());
178+
continue;
179+
}
180+
181+
LayoutVersionProto dnLayout = datanodeInfo.getLastKnownLayoutVersion();
182+
int dnMlv = dnLayout.getMetadataLayoutVersion();
183+
int dnSlv = dnLayout.getSoftwareLayoutVersion();
184+
185+
if (dnMlv < dnSlv) {
186+
// Datanode has not yet finalized
187+
LOG.debug("Datanode {} has not yet finalized: MLV={}, SLV={}",
188+
dn.getHostName(), dnMlv, dnSlv);
189+
} else {
190+
finalizedNodes++;
191+
}
192+
} catch (NodeNotFoundException e) {
193+
// Node was removed while we were iterating. This is OK, skip it.
194+
LOG.debug("Node {} not found while waiting for finalization, " +
195+
"skipping.", dn);
196+
}
197+
}
198+
199+
return new DatanodeFinalizationCounts(finalizedNodes, totalHealthyNodes);
200+
}
201+
147202
/**
148203
* Returns the aggregated node stats.
149204
* @return the aggregated node stats.
@@ -420,4 +475,30 @@ default void removeNode(DatanodeDetails datanodeDetails) throws NodeNotFoundExce
420475
}
421476

422477
int openContainerLimit(List<DatanodeDetails> datanodes);
478+
479+
/**
 * Immutable holder for the number of finalized datanodes and the total
 * number of healthy datanodes they were counted from.
 */
final class DatanodeFinalizationCounts {
  private final int numFinalizedDatanodes;
  private final int totalHealthyDatanodes;

  /**
   * @param numFinalizedDatanodes number of healthy datanodes that have
   *     finalized
   * @param totalHealthyDatanodes total number of healthy datanodes
   */
  public DatanodeFinalizationCounts(int numFinalizedDatanodes,
      int totalHealthyDatanodes) {
    this.numFinalizedDatanodes = numFinalizedDatanodes;
    this.totalHealthyDatanodes = totalHealthyDatanodes;
  }

  /**
   * @return the number of finalized datanodes
   */
  public int getNumFinalizedDatanodes() {
    return numFinalizedDatanodes;
  }

  /**
   * @return the total number of healthy datanodes
   */
  public int getTotalHealthyDatanodes() {
    return totalHealthyDatanodes;
  }

  /**
   * @return true when the finalized count equals the healthy total; this is
   *     also true when both counts are zero
   */
  public boolean allNodesFinalized() {
    return numFinalizedDatanodes == totalHealthyDatanodes;
  }

  /**
   * @return a readable form of both counts, suitable for log messages
   */
  @Override
  public String toString() {
    return "DatanodeFinalizationCounts{numFinalizedDatanodes="
        + numFinalizedDatanodes
        + ", totalHealthyDatanodes=" + totalHealthyDatanodes + "}";
  }
}
423504
}

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@
9292
import org.apache.hadoop.hdds.scm.ha.SCMRatisServerImpl;
9393
import org.apache.hadoop.hdds.scm.node.DatanodeInfo;
9494
import org.apache.hadoop.hdds.scm.node.DatanodeUsageInfo;
95+
import org.apache.hadoop.hdds.scm.node.NodeManager;
9596
import org.apache.hadoop.hdds.scm.node.NodeStatus;
9697
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
9798
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
@@ -1153,13 +1154,20 @@ public HddsProtos.UpgradeStatus queryUpgradeStatus() throws IOException {
11531154
try {
11541155
getScm().checkAdminAccess(getRemoteUser(), true);
11551156

1156-
// Returning a placeholder for now.
1157+
boolean scmFinalized = !scm.getLayoutVersionManager().needsFinalization();
1158+
NodeManager.DatanodeFinalizationCounts datanodeFinalizationCounts =
1159+
scm.getScmNodeManager().getDatanodeFinalizationCounts();
1160+
int finalizedDatanodes = datanodeFinalizationCounts.getNumFinalizedDatanodes();
1161+
int healthyDatanodes = datanodeFinalizationCounts.getTotalHealthyDatanodes();
1162+
boolean shouldFinalize = scmFinalized && datanodeFinalizationCounts.allNodesFinalized();
1163+
11571164
HddsProtos.UpgradeStatus result = HddsProtos.UpgradeStatus.newBuilder()
1158-
.setScmFinalized(true)
1159-
.setNumDatanodesFinalized(10)
1160-
.setNumDatanodesTotal(10)
1161-
.setShouldFinalize(true)
1165+
.setScmFinalized(scmFinalized)
1166+
.setNumDatanodesFinalized(finalizedDatanodes)
1167+
.setNumDatanodesTotal(healthyDatanodes)
1168+
.setShouldFinalize(shouldFinalize)
11621169
.build();
1170+
11631171
AUDIT.logReadSuccess(buildAuditMessageForSuccess(SCMAction.QUERY_UPGRADE_STATUS, null));
11641172
return result;
11651173
} catch (IOException ex) {

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/upgrade/SCMUpgradeFinalizer.java

Lines changed: 5 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@
1818
package org.apache.hadoop.hdds.scm.server.upgrade;
1919

2020
import java.io.IOException;
21-
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
22-
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.LayoutVersionProto;
2321
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
24-
import org.apache.hadoop.hdds.scm.node.DatanodeInfo;
2522
import org.apache.hadoop.hdds.scm.node.NodeManager;
26-
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
2723
import org.apache.hadoop.hdds.upgrade.HDDSLayoutFeature;
2824
import org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager;
2925
import org.apache.hadoop.ozone.upgrade.BasicUpgradeFinalizer;
@@ -121,46 +117,13 @@ private void waitForDatanodesToFinalize(SCMUpgradeFinalizationContext context)
121117
// SCM is no longer the leader by throwing NotLeaderException.
122118
context.getSCMContext().getTermOfLeader();
123119

124-
allDatanodesFinalized = true;
125-
int totalHealthyNodes = 0;
126-
int finalizedNodes = 0;
127-
int unfinalizedNodes = 0;
128-
129-
for (DatanodeDetails dn : nodeManager.getAllNodes()) {
130-
try {
131-
// Only check HEALTHY nodes. STALE/DEAD nodes will be told to
132-
// finalize when they recover.
133-
if (nodeManager.getNodeStatus(dn).isHealthy()) {
134-
totalHealthyNodes++;
135-
DatanodeInfo datanodeInfo = nodeManager.getDatanodeInfo(dn);
136-
if (datanodeInfo == null) {
137-
LOG.warn("Could not get DatanodeInfo for {}, skipping in " +
138-
"finalization wait.", dn.getHostName());
139-
continue;
140-
}
141-
142-
LayoutVersionProto dnLayout = datanodeInfo.getLastKnownLayoutVersion();
143-
int dnMlv = dnLayout.getMetadataLayoutVersion();
144-
int dnSlv = dnLayout.getSoftwareLayoutVersion();
145-
146-
if (dnMlv < dnSlv) {
147-
// Datanode has not yet finalized
148-
allDatanodesFinalized = false;
149-
unfinalizedNodes++;
150-
LOG.debug("Datanode {} has not yet finalized: MLV={}, SLV={}",
151-
dn.getHostName(), dnMlv, dnSlv);
152-
} else {
153-
finalizedNodes++;
154-
}
155-
}
156-
} catch (NodeNotFoundException e) {
157-
// Node was removed while we were iterating. This is OK, skip it.
158-
LOG.debug("Node {} not found while waiting for finalization, " +
159-
"skipping.", dn);
160-
}
161-
}
120+
NodeManager.DatanodeFinalizationCounts datanodeFinalizationCounts = nodeManager.getDatanodeFinalizationCounts();
121+
int finalizedNodes = datanodeFinalizationCounts.getNumFinalizedDatanodes();
122+
int totalHealthyNodes = datanodeFinalizationCounts.getTotalHealthyDatanodes();
123+
allDatanodesFinalized = datanodeFinalizationCounts.allNodesFinalized();
162124

163125
if (!allDatanodesFinalized) {
126+
int unfinalizedNodes = totalHealthyNodes - finalizedNodes;
164127
LOG.info("Waiting for datanodes to finalize. Status: {}/{} healthy " +
165128
"datanodes have finalized ({} remaining).",
166129
finalizedNodes, totalHealthyNodes, unfinalizedNodes);

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,146 @@ public void testProcessLayoutVersion() throws IOException {
737737
testProcessLayoutVersionReportHigherMlv();
738738
}
739739

740+
@Test
741+
public void testDatanodeFinalizedCounterTracksLayoutVersionReports()
742+
throws IOException, AuthenticationException {
743+
try (SCMNodeManager nodeManager = createNodeManager(getConf())) {
744+
DatanodeDetails node =
745+
HddsTestUtils.createRandomDatanodeAndRegister(nodeManager);
746+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts()
747+
.getNumFinalizedDatanodes(),
748+
"Initial datanode should be counted as finalized");
749+
750+
int softwareVersion =
751+
nodeManager.getLayoutVersionManager().getSoftwareLayoutVersion();
752+
int metadataVersion =
753+
nodeManager.getLayoutVersionManager().getMetadataLayoutVersion();
754+
nodeManager.processLayoutVersionReport(node,
755+
LayoutVersionProto.newBuilder()
756+
.setMetadataLayoutVersion(metadataVersion - 1)
757+
.setSoftwareLayoutVersion(softwareVersion)
758+
.build());
759+
assertEquals(0, nodeManager.getDatanodeFinalizationCounts()
760+
.getNumFinalizedDatanodes(),
761+
"Lower metadata layout version should decrement finalized count");
762+
763+
nodeManager.processLayoutVersionReport(node,
764+
LayoutVersionProto.newBuilder()
765+
.setMetadataLayoutVersion(metadataVersion)
766+
.setSoftwareLayoutVersion(softwareVersion)
767+
.build());
768+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts()
769+
.getNumFinalizedDatanodes(),
770+
"Restored metadata layout version should restore finalized count");
771+
}
772+
}
773+
774+
@Test
775+
public void testDatanodeFinalizedCounterTracksRegistrationAndRemoveNode()
776+
throws IOException, AuthenticationException, NodeNotFoundException {
777+
try (SCMNodeManager nodeManager = createNodeManager(getConf())) {
778+
DatanodeDetails finalizedNode =
779+
registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success);
780+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts()
781+
.getNumFinalizedDatanodes(),
782+
"Finalized registration should increment finalized count");
783+
784+
DatanodeDetails nonFinalizedNode =
785+
registerWithCapacity(nodeManager, SMALLER_MLV_LAYOUT_PROTO, success);
786+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts()
787+
.getNumFinalizedDatanodes(),
788+
"Non-finalized registration should not increment finalized count");
789+
790+
nonFinalizedNode.setPersistedOpState(
791+
HddsProtos.NodeOperationalState.DECOMMISSIONED);
792+
nodeManager.removeNode(nonFinalizedNode);
793+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts()
794+
.getNumFinalizedDatanodes(),
795+
"Removing a non-finalized node should not change finalized count");
796+
797+
finalizedNode.setPersistedOpState(
798+
HddsProtos.NodeOperationalState.DECOMMISSIONED);
799+
nodeManager.removeNode(finalizedNode);
800+
assertEquals(0, nodeManager.getDatanodeFinalizationCounts().getNumFinalizedDatanodes(),
801+
"Removing a finalized node should decrement finalized count");
802+
}
803+
}
804+
805+
private static Stream<Arguments> ineligibleHealthStates() {
806+
return Stream.of(
807+
Arguments.of(NodeStatus.inServiceStale()),
808+
Arguments.of(NodeStatus.inServiceDead())
809+
);
810+
}
811+
812+
@ParameterizedTest
813+
@MethodSource("ineligibleHealthStates")
814+
public void testDatanodeFinalizedCounterExcludesNonHealthyNodes(NodeStatus expectedStatus)
815+
throws IOException, AuthenticationException, NodeNotFoundException, InterruptedException {
816+
OzoneConfiguration conf = getConf();
817+
conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL, 100, MILLISECONDS);
818+
conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 1, SECONDS);
819+
conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, SECONDS);
820+
conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 6, SECONDS);
821+
822+
try (SCMNodeManager nodeManager = createNodeManager(conf)) {
823+
// transitionNode stops heartbeating and will become STALE or DEAD
824+
DatanodeDetails transitionNode =
825+
registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success);
826+
// heartbeatingNode keeps heartbeating as a healthy baseline
827+
DatanodeDetails heartbeatingNode =
828+
registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success);
829+
830+
nodeManager.processHeartbeat(transitionNode);
831+
nodeManager.processHeartbeat(heartbeatingNode);
832+
833+
assertEquals(2, nodeManager.getDatanodeFinalizationCounts().getTotalHealthyDatanodes(),
834+
"Both nodes should start as healthy");
835+
836+
// Only heartbeat the baseline node until transitionNode reaches the expected state.
837+
// STALE requires > 3s (wait 4s), DEAD requires > 6s (wait 7s total).
838+
boolean waitForDead = expectedStatus.equals(NodeStatus.inServiceDead());
839+
Thread.sleep(2000);
840+
nodeManager.processHeartbeat(heartbeatingNode);
841+
Thread.sleep(2000);
842+
if (waitForDead) {
843+
nodeManager.processHeartbeat(heartbeatingNode);
844+
Thread.sleep(3000);
845+
}
846+
847+
assertEquals(expectedStatus, nodeManager.getNodeStatus(transitionNode),
848+
"Node should have transitioned to " + expectedStatus);
849+
850+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getTotalHealthyDatanodes(),
851+
expectedStatus + " node should be excluded from total count");
852+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getNumFinalizedDatanodes(),
853+
expectedStatus + " node should be excluded from finalized count");
854+
}
855+
}
856+
857+
private static Stream<Arguments> allOperationalStates() {
858+
return Stream.of(HddsProtos.NodeOperationalState.values())
859+
.map(Arguments::of);
860+
}
861+
862+
@ParameterizedTest
863+
@MethodSource("allOperationalStates")
864+
public void testDatanodeFinalizedCounterIncludesAllHealthyOpStates(
865+
HddsProtos.NodeOperationalState opState)
866+
throws IOException, AuthenticationException, NodeNotFoundException {
867+
try (SCMNodeManager nodeManager = createNodeManager(getConf())) {
868+
DatanodeDetails node =
869+
registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success);
870+
nodeManager.setNodeOperationalState(node, opState);
871+
872+
// All HEALTHY nodes should be counted regardless of operational state
873+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getTotalHealthyDatanodes(),
874+
"HEALTHY node with op state " + opState + " should be counted in total");
875+
assertEquals(1, nodeManager.getDatanodeFinalizationCounts().getNumFinalizedDatanodes(),
876+
"HEALTHY finalized node with op state " + opState + " should be counted as finalized");
877+
}
878+
}
879+
740880
// Currently invoked by testProcessLayoutVersion.
741881
public void testProcessLayoutVersionReportHigherMlv()
742882
throws IOException {

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMClientProtocolServer.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,18 @@ private StorageContainerManager mockStorageContainerManager() {
155155
return storageContainerManager;
156156
}
157157

158+
@Test
159+
public void testQueryUpgradeStatus() throws Exception {
160+
HddsProtos.UpgradeStatus status = server.queryUpgradeStatus();
161+
162+
// SCM starts already finalized in tests
163+
assertTrue(status.getScmFinalized());
164+
// No datanodes registered
165+
assertEquals(0, status.getNumDatanodesFinalized());
166+
assertEquals(0, status.getNumDatanodesTotal());
167+
assertTrue(status.getShouldFinalize());
168+
}
169+
158170
private ContainerInfo newContainerInfoForTest() {
159171
return new ContainerInfo.Builder()
160172
.setContainerID(1)

hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/upgrade/StatusSubCommand.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,7 @@ public class StatusSubCommand extends ScmSubcommand {
3939
public void execute(ScmClient client) throws IOException {
4040
HddsProtos.UpgradeStatus status = client.queryUpgradeStatus();
4141

42-
// Temporary output to validate the command is working.
43-
out().println("Update status:");
42+
out().println("Upgrade status:");
4443
out().println(" SCM Finalized: " + status.getScmFinalized());
4544
out().println(" Datanodes finalized: " + status.getNumDatanodesFinalized());
4645
out().println(" Total Datanodes: " + status.getNumDatanodesTotal());

0 commit comments

Comments
 (0)