7373import org .apache .hadoop .ozone .protocol .commands .DeleteBlocksCommand ;
7474import org .apache .hadoop .security .authentication .client .AuthenticationException ;
7575import org .apache .ozone .test .LambdaTestUtils ;
76- import org .apache .ozone .test .tag .Flaky ;
7776import org .junit .jupiter .api .AfterEach ;
7877import org .junit .jupiter .api .BeforeEach ;
7978import org .junit .jupiter .api .Test ;
@@ -100,6 +99,11 @@ public void setup() throws IOException, AuthenticationException {
10099 OzoneConfiguration conf = new OzoneConfiguration ();
101100 conf .setTimeDuration (HddsConfigKeys .HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT ,
102101 0 , TimeUnit .SECONDS );
102+ // The test drives node health transitions manually. Since the node's
103+ // heartbeat stays fresh, the periodic health check would resurrect a node
104+ // forced to DEAD and race with the handlers under test, so postpone it.
105+ conf .setTimeDuration (ScmConfigKeys .OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL ,
106+ 1 , TimeUnit .HOURS );
103107 conf .setInt (ScmConfigKeys .OZONE_DATANODE_PIPELINE_LIMIT , 2 );
104108 conf .setStorageSize (OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN ,
105109 10 , StorageUnit .MB );
@@ -139,7 +143,6 @@ public void teardown() {
139143
140144 @ Test
141145 @ SuppressWarnings ("checkstyle:MethodLength" )
142- @ Flaky ("HDDS-14977" )
143146 public void testOnMessage (@ TempDir File tempDir ) throws Exception {
144147 //GIVEN
145148 DatanodeDetails datanode1 = MockDatanodeDetails .randomDatanodeDetails ();
@@ -266,6 +269,12 @@ public void testOnMessage(@TempDir File tempDir) throws Exception {
266269 nodeManager .addDatanodeCommand (datanode1 .getID (), cmd );
267270 nodeManager .setNodeOperationalState (datanode1 ,
268271 HddsProtos .NodeOperationalState .IN_SERVICE );
272+ // Changing the operational state of a DEAD node fires a DEAD_NODE event on
273+ // SCM's event queue. Drain the queue here so SCM's own DeadNodeHandler
274+ // finishes first; otherwise its asynchronous topology removal could race
275+ // with the handlers driven below and remove the node right after
276+ // HealthyReadOnlyNodeHandler re-adds it.
277+ ((EventQueue ) scm .getEventQueue ()).processAll (60000L );
269278 setNodeHealthState (datanode1 , HddsProtos .NodeState .DEAD );
270279 deadNodeHandler .onMessage (datanode1 , publisher );
271280 //datanode1 has been removed from ClusterNetworkTopology, another
0 commit comments