113113import io .fabric8 .kubernetes .client .server .mock .EnableKubernetesMockClient ;
114114import io .fabric8 .kubernetes .client .server .mock .KubernetesMockServer ;
115115import lombok .SneakyThrows ;
116+ import org .jetbrains .annotations .NotNull ;
116117import org .junit .jupiter .api .Assertions ;
117118import org .junit .jupiter .api .BeforeEach ;
118119import org .junit .jupiter .api .Test ;
@@ -334,15 +335,7 @@ public void cancelJobWithStatelessUpgradeModeTest() throws Exception {
334335 @ ValueSource (ints = {404 , 409 , 500 })
335336 public void cancelErrorHandling (int statusCode ) throws Exception {
336337
337- var testingClusterClient =
338- new TestingClusterClient <>(configuration , TestUtils .TEST_DEPLOYMENT_NAME );
339- testingClusterClient .setCancelFunction (
340- jobID ->
341- CompletableFuture .failedFuture (
342- new RuntimeException (
343- new RestClientException (
344- "errrr" , HttpResponseStatus .valueOf (statusCode )))));
345- var flinkService = new TestingService (testingClusterClient );
338+ var flinkService = getTestingService ("errrr" , HttpResponseStatus .valueOf (statusCode ));
346339
347340 JobID jobID = JobID .generate ();
348341 var job = TestUtils .buildSessionJob ();
@@ -360,10 +353,104 @@ public void cancelErrorHandling(int statusCode) throws Exception {
360353 assertEquals (RUNNING , jobStatus .getState ());
361354 } else {
362355 flinkService .cancelSessionJob (job , SuspendMode .STATELESS , new Configuration ());
363- assertEquals (CANCELLING , jobStatus .getState ());
356+ assertEquals (FINISHED , jobStatus .getState ());
357+ assertNull (jobStatus .getJobId ());
364358 }
365359 }
366360
361+ @ Test
362+ public void cancelErrorHandlingWithTerminalStateMessage () throws Exception {
363+ var flinkService =
364+ getTestingService (
365+ "Job cancellation failed because the job has already reached another terminal state (FAILED)." ,
366+ HttpResponseStatus .BAD_REQUEST );
367+
368+ JobID jobID = JobID .generate ();
369+ var job = TestUtils .buildSessionJob ();
370+ var jobStatus = job .getStatus ().getJobStatus ();
371+ jobStatus .setJobId (jobID .toHexString ());
372+ jobStatus .setState (RUNNING );
373+ ReconciliationUtils .updateStatusForDeployedSpec (job , new Configuration ());
374+
375+ flinkService .cancelSessionJob (job , SuspendMode .STATELESS , new Configuration ());
376+ assertEquals (FINISHED , jobStatus .getState ());
377+ assertNull (jobStatus .getJobId ());
378+ }
379+
380+ /**
381+ * Reproduces the operator-upgrade scenario for Session Mode with CANCEL upgrade mode: when a
382+ * running session job's JobManager has already moved the job into a terminal state (e.g.
383+ * FAILED) and the operator (after a restart/upgrade) tries to cancel it, the cancellation
384+ * request comes back with "already reached another terminal state". Previously this caused the
385+ * finalizer to never be removed, leaving the CR stuck in Terminating.
386+ */
387+ @ Test
388+ public void cancelSessionJobWithCancelModeAndTerminalStateMessage () throws Exception {
389+ var flinkService =
390+ getTestingService (
391+ "Job cancellation failed because the job has already reached another terminal state (FAILED)." ,
392+ HttpResponseStatus .BAD_REQUEST );
393+
394+ JobID jobID = JobID .generate ();
395+ var job = TestUtils .buildSessionJob ();
396+ var jobStatus = job .getStatus ().getJobStatus ();
397+ jobStatus .setJobId (jobID .toHexString ());
398+ jobStatus .setState (RUNNING );
399+ ReconciliationUtils .updateStatusForDeployedSpec (job , new Configuration ());
400+
401+ var result = flinkService .cancelSessionJob (job , SuspendMode .CANCEL , new Configuration ());
402+ // Must NOT be pending — the CR would otherwise be stuck in Terminating indefinitely
403+ assertFalse (result .isPending ());
404+ assertEquals (FINISHED , jobStatus .getState ());
405+ assertNull (jobStatus .getJobId ());
406+ }
407+
408+ @ NotNull
409+ private TestingService getTestingService (String message , HttpResponseStatus badRequest )
410+ throws Exception {
411+ final var testingClusterClient =
412+ new TestingClusterClient <>(configuration , TestUtils .TEST_DEPLOYMENT_NAME );
413+ testingClusterClient .setCancelFunction (
414+ jobID ->
415+ CompletableFuture .failedFuture (
416+ new RuntimeException (
417+ new RestClientException (message , badRequest ))));
418+ return new TestingService (testingClusterClient );
419+ }
420+
421+ /**
422+ * Reproduces FLINK-37766 for Application Mode: when a running application job's JobManager has
423+ * moved the job to a terminal state (e.g. FAILED due to HA desync) and the operator tries to
424+ * cancel the job with CANCEL suspend mode (used for last-state upgrades), the "already reached
425+ * another terminal state" response previously caused the operator to always return
426+ * CancelResult.pending(), looping forever without completing the upgrade/deletion.
427+ */
428+ @ Test
429+ public void cancelApplicationJobWithCancelModeAndTerminalStateMessage () throws Exception {
430+ var flinkService =
431+ getTestingService (
432+ "Job cancellation failed because the job has already reached another terminal state (FAILED)." ,
433+ HttpResponseStatus .BAD_REQUEST );
434+
435+ JobID jobID = JobID .generate ();
436+ FlinkDeployment deployment = TestUtils .buildApplicationCluster ();
437+ deployment .getStatus ().setJobManagerDeploymentStatus (JobManagerDeploymentStatus .READY );
438+ JobStatus jobStatus = deployment .getStatus ().getJobStatus ();
439+ jobStatus .setJobId (jobID .toHexString ());
440+ jobStatus .setState (RUNNING );
441+ ReconciliationUtils .updateStatusForDeployedSpec (deployment , new Configuration ());
442+
443+ var result =
444+ flinkService .cancelJob (
445+ deployment ,
446+ SuspendMode .CANCEL ,
447+ configManager .getObserveConfig (deployment ),
448+ false );
449+ // Must NOT be pending — the operator would otherwise loop forever on the upgrade
450+ assertFalse (result .isPending ());
451+ assertEquals (FINISHED , jobStatus .getState ());
452+ }
453+
367454 @ ParameterizedTest
368455 @ ValueSource (booleans = {true , false })
369456 public void cancelJobWithSavepointUpgradeModeTest (boolean deleteAfterSavepoint )
0 commit comments