32 | 32 | import io.javaoperatorsdk.operator.api.reconciler.Context; |
33 | 33 | import org.junit.jupiter.api.BeforeEach; |
34 | 34 | import org.junit.jupiter.params.ParameterizedTest; |
| 35 | +import org.junit.jupiter.params.provider.EnumSource; |
35 | 36 | import org.junit.jupiter.params.provider.MethodSource; |
36 | 37 |
37 | 38 | import java.time.Duration; |
38 | 39 |
| 40 | +import static org.apache.flink.api.common.JobStatus.FAILED; |
39 | 41 | import static org.apache.flink.api.common.JobStatus.RUNNING; |
40 | 42 | import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_ENABLED; |
41 | 43 | import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW; |
@@ -156,4 +158,81 @@ public void verifyApplicationNoCompletedCheckpointsJmRecovery( |
156 | 158 | appCluster.getStatus().getJobManagerDeploymentStatus()); |
157 | 159 | assertEquals(RUNNING, appCluster.getStatus().getJobStatus().getState()); |
158 | 160 | } |
| 161 | + |
| 162 | + @ParameterizedTest |
| 163 | + @MethodSource("org.apache.flink.kubernetes.operator.TestUtils#flinkVersionsAndUpgradeModes") |
| 164 | + public void verifyTerminallyFailedJobNotRestartedByHealthCheck( |
| 165 | + FlinkVersion flinkVersion, UpgradeMode upgradeMode) throws Exception { |
| 166 | + FlinkDeployment appCluster = TestUtils.buildApplicationCluster(flinkVersion); |
| 167 | + appCluster.getSpec().getJob().setUpgradeMode(upgradeMode); |
| 168 | + |
| 169 | + // Start a healthy deployment |
| 170 | + flinkService.setMetricValue(NUM_RESTARTS_METRIC_NAME, "0"); |
| 171 | + flinkService.setMetricValue(NUMBER_OF_COMPLETED_CHECKPOINTS_METRIC_NAME, "1"); |
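| | +        // Repeated reconciles advance the JobManager deployment through its startup statuses to READY |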
| 172 | + testController.reconcile(appCluster, context); |
| 173 | + testController.reconcile(appCluster, context); |
| 174 | + testController.reconcile(appCluster, context); |
| 175 | + assertEquals( |
| 176 | + JobManagerDeploymentStatus.READY, |
| 177 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 178 | + assertEquals(RUNNING, appCluster.getStatus().getJobStatus().getState()); |
| 179 | + |
| 180 | + // Mark job as terminally FAILED |
| 181 | + flinkService.markApplicationJobFailedWithError( |
| 182 | + flinkService.listJobs().get(0).f1.getJobId(), "Terminal failure"); |
| 183 | + |
| 184 | + // Age the checkpoint health data to simulate an unhealthy evaluation |
| 185 | + // (no checkpoint progress within the window), which would normally trigger a restart |
| 186 | + var clusterHealthInfo = |
| 187 | + ClusterHealthObserver.CLUSTER_HEALTH_INFOS.get(ResourceID.fromResource(appCluster)); |
| 188 | +        clusterHealthInfo.setNumCompletedCheckpointsIncreasedTimeStamp( |
| 189 | +                clusterHealthInfo.getNumCompletedCheckpointsIncreasedTimeStamp() - Duration.ofMinutes(20).toMillis()); |
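| | +        // Persist the updated status so the aged health data is seen by the next reconcile |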
| 190 | + testController.getStatusRecorder().patchAndCacheStatus(appCluster, kubernetesClient); |
| 191 | + |
| 192 | +        // Reconcile: a terminally FAILED job must NOT be restarted via the health-check code path. |
| 193 | +        // The health-based restart path requires HA metadata, which a terminated job no longer has, |
| 194 | +        // and restarting a terminally failed job is governed solely by OPERATOR_JOB_RESTART_FAILED. |
| 195 | + testController.reconcile(appCluster, context); |
| 196 | + |
| 197 | + assertEquals(FAILED, appCluster.getStatus().getJobStatus().getState()); |
| 198 | + assertEquals( |
| 199 | + JobManagerDeploymentStatus.READY, |
| 200 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 201 | + } |
| 202 | + |
| 203 | + /** |
| 204 | + * For stateful (LAST_STATE / SAVEPOINT) upgrade modes, a health-based restart must NOT be |
| 205 | +     * triggered when HA metadata is absent. Without HA metadata, the restart would immediately fail |
| 206 | + * with an UpgradeFailureException, so the check must be skipped entirely. |
| 207 | + */ |
| 208 | + @ParameterizedTest |
| 209 | + @EnumSource( |
| 210 | + value = UpgradeMode.class, |
| 211 | + names = {"LAST_STATE", "SAVEPOINT"}) |
| 212 | + public void verifyUnhealthyRestartSkippedWhenHaMetadataAbsent(UpgradeMode upgradeMode) |
| 213 | + throws Exception { |
| 214 | + FlinkDeployment appCluster = TestUtils.buildApplicationCluster(); |
| 215 | + appCluster.getSpec().getJob().setUpgradeMode(upgradeMode); |
| 216 | + |
| 217 | + // Start a healthy deployment (HA metadata available by default) |
| 218 | + testController.reconcile(appCluster, context); |
| 219 | + testController.reconcile(appCluster, context); |
| 220 | + testController.reconcile(appCluster, context); |
| 221 | + assertEquals( |
| 222 | + JobManagerDeploymentStatus.READY, |
| 223 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 224 | + assertEquals(RUNNING, appCluster.getStatus().getJobStatus().getState()); |
| 225 | + |
| 226 | +        // Simulate an unhealthy cluster (restart count exceeds threshold) while HA metadata is absent |
| 227 | + flinkService.setMetricValue(NUM_RESTARTS_METRIC_NAME, "100"); |
| 228 | + flinkService.setHaDataAvailable(false); |
| 229 | + |
| 230 | + testController.reconcile(appCluster, context); |
| 231 | + |
| 232 | + // Health-based restart must NOT be triggered when HA metadata is absent |
| 233 | + assertEquals(RUNNING, appCluster.getStatus().getJobStatus().getState()); |
| 234 | + assertEquals( |
| 235 | + JobManagerDeploymentStatus.READY, |
| 236 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 237 | + } |
159 | 238 | } |