From ab11c16ab274599c71c6e69e1af99f14540b642f Mon Sep 17 00:00:00 2001 From: Chi-Hsuan Huang Date: Sat, 20 Jun 2026 10:49:00 +0800 Subject: [PATCH 1/2] HDDS-14977. Intermittent failure in TestDeadNodeHandler.testOnMessage --- .../apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java index b52733cdc100..2aabbc94230b 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java @@ -73,7 +73,6 @@ import org.apache.hadoop.ozone.protocol.commands.DeleteBlocksCommand; import org.apache.hadoop.security.authentication.client.AuthenticationException; import org.apache.ozone.test.LambdaTestUtils; -import org.apache.ozone.test.tag.Flaky; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -100,6 +99,11 @@ public void setup() throws IOException, AuthenticationException { OzoneConfiguration conf = new OzoneConfiguration(); conf.setTimeDuration(HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT, 0, TimeUnit.SECONDS); + // The test drives node health transitions manually. Disable the periodic + // health check so it does not resurrect a node forced to DEAD (the node's + // heartbeat stays fresh), which would race with the handlers under test. + conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL, + 1, TimeUnit.HOURS); conf.setInt(ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT, 2); conf.setStorageSize(OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN, 10, StorageUnit.MB); @@ -139,7 +143,6 @@ public void teardown() { @Test @SuppressWarnings("checkstyle:MethodLength") - @Flaky("HDDS-14977") public void testOnMessage(@TempDir File tempDir) throws Exception { //GIVEN DatanodeDetails datanode1 = MockDatanodeDetails.randomDatanodeDetails(); From fce88e670bc9bc931b925b9d56676f6200dddbb1 Mon Sep 17 00:00:00 2001 From: Chi-Hsuan Huang Date: Sun, 21 Jun 2026 10:26:44 +0800 Subject: [PATCH 2/2] HDDS-14977. Drain SCM event queue to avoid async DeadNodeHandler race --- .../apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java index 2aabbc94230b..7b28de473eaf 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java @@ -269,6 +269,12 @@ public void testOnMessage(@TempDir File tempDir) throws Exception { nodeManager.addDatanodeCommand(datanode1.getID(), cmd); nodeManager.setNodeOperationalState(datanode1, HddsProtos.NodeOperationalState.IN_SERVICE); + // Changing the operational state of a DEAD node fires a DEAD_NODE event on + // SCM's event queue. Let SCM's own DeadNodeHandler process it here, so its + // asynchronous topology removal does not race with the handlers driven + // below (it could otherwise remove the node right after + // HealthyReadOnlyNodeHandler re-adds it). + ((EventQueue) scm.getEventQueue()).processAll(60000L); setNodeHealthState(datanode1, HddsProtos.NodeState.DEAD); deadNodeHandler.onMessage(datanode1, publisher); //datanode1 has been removed from ClusterNetworkTopology, another