Skip to content

Commit 8b93ea4

Browse files
authored
HDDS-14977. Intermittent failure in TestDeadNodeHandler.testOnMessage (#10556)
1 parent 3d5b74b commit 8b93ea4

1 file changed

Lines changed: 11 additions & 2 deletions

File tree

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,6 @@
7373
import org.apache.hadoop.ozone.protocol.commands.DeleteBlocksCommand;
7474
import org.apache.hadoop.security.authentication.client.AuthenticationException;
7575
import org.apache.ozone.test.LambdaTestUtils;
76-
import org.apache.ozone.test.tag.Flaky;
7776
import org.junit.jupiter.api.AfterEach;
7877
import org.junit.jupiter.api.BeforeEach;
7978
import org.junit.jupiter.api.Test;
@@ -100,6 +99,11 @@ public void setup() throws IOException, AuthenticationException {
10099
OzoneConfiguration conf = new OzoneConfiguration();
101100
conf.setTimeDuration(HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT,
102101
0, TimeUnit.SECONDS);
102+
// The test drives node health transitions manually. Disable the periodic
103+
// health check so it does not resurrect a node forced to DEAD (the node's
104+
// heartbeat stays fresh), which would race with the handlers under test.
105+
conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL,
106+
1, TimeUnit.HOURS);
103107
conf.setInt(ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT, 2);
104108
conf.setStorageSize(OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN,
105109
10, StorageUnit.MB);
@@ -139,7 +143,6 @@ public void teardown() {
139143

140144
@Test
141145
@SuppressWarnings("checkstyle:MethodLength")
142-
@Flaky("HDDS-14977")
143146
public void testOnMessage(@TempDir File tempDir) throws Exception {
144147
//GIVEN
145148
DatanodeDetails datanode1 = MockDatanodeDetails.randomDatanodeDetails();
@@ -266,6 +269,12 @@ public void testOnMessage(@TempDir File tempDir) throws Exception {
266269
nodeManager.addDatanodeCommand(datanode1.getID(), cmd);
267270
nodeManager.setNodeOperationalState(datanode1,
268271
HddsProtos.NodeOperationalState.IN_SERVICE);
272+
// Changing the operational state of a DEAD node fires a DEAD_NODE event on
273+
// SCM's event queue. Let SCM's own DeadNodeHandler process it here, so its
274+
// asynchronous topology removal does not race with the handlers driven
275+
// below (it could otherwise remove the node right after
276+
// HealthyReadOnlyNodeHandler re-adds it).
277+
((EventQueue) scm.getEventQueue()).processAll(60000L);
269278
setNodeHealthState(datanode1, HddsProtos.NodeState.DEAD);
270279
deadNodeHandler.onMessage(datanode1, publisher);
271280
//datanode1 has been removed from ClusterNetworkTopology, another

0 commit comments

Comments (0)