@@ -4601,31 +4601,31 @@ status:
46014601
46024602 // Assertion 2: The parent task group node should be in Running state
46034603 // The target node's parent task group is "subdag-iterate-jrfch-1295605062" (echo-group(0:foo))
4604- parentTaskGroupNode := retryWf .Status .Nodes ["subdag-iterate-jrfch-1295605062" ]
4605- require .NotNil (t , parentTaskGroupNode , "Parent task group node should exist" )
4604+ parentTaskGroupNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch-1295605062" ]
4605+ require .True (t , ok , "Parent task group node should exist" )
46064606 assert .Equal (t , wfv1 .NodeRunning , parentTaskGroupNode .Phase ,
46074607 "Parent task group node should be in Running state" )
46084608
46094609 // Assertion 3: Other nodes in different iterations should remain unchanged
46104610 // Check nodes from other iterations (bar and baz) remain succeeded
4611- barIterationNode := retryWf .Status .Nodes ["subdag-iterate-jrfch-4005883592" ] // echo-group(1:bar)
4612- require .NotNil (t , barIterationNode , "Bar iteration node should exist" )
4611+ barIterationNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch-4005883592" ] // echo-group(1:bar)
4612+ require .True (t , ok , "Bar iteration node should exist" )
46134613 assert .Equal (t , wfv1 .NodeSucceeded , barIterationNode .Phase ,
46144614 "Bar iteration node should remain succeeded" )
46154615
4616- bazIterationNode := retryWf .Status .Nodes ["subdag-iterate-jrfch-851163487" ] // echo-group(2:baz)
4617- require .NotNil (t , bazIterationNode , "Baz iteration node should exist" )
4616+ bazIterationNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch-851163487" ] // echo-group(2:baz)
4617+ require .True (t , ok , "Baz iteration node should exist" )
46184618 assert .Equal (t , wfv1 .NodeSucceeded , bazIterationNode .Phase ,
46194619 "Baz iteration node should remain succeeded" )
46204620
46214621 // Check that nodes from other iterations' pods remain succeeded
4622- barPodNode := retryWf .Status .Nodes ["subdag-iterate-jrfch-2240770818" ] // bar echo-bye pod
4623- require .NotNil (t , barPodNode , "Bar pod node should exist" )
4622+ barPodNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch-2240770818" ] // bar echo-bye pod
4623+ require .True (t , ok , "Bar pod node should exist" )
46244624 assert .Equal (t , wfv1 .NodeSucceeded , barPodNode .Phase ,
46254625 "Bar pod node should remain succeeded" )
46264626
4627- bazPodNode := retryWf .Status .Nodes ["subdag-iterate-jrfch-656740947" ] // baz echo-bye pod
4628- require .NotNil (t , bazPodNode , "Baz pod node should exist" )
4627+ bazPodNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch-656740947" ] // baz echo-bye pod
4628+ require .True (t , ok , "Baz pod node should exist" )
46294629 assert .Equal (t , wfv1 .NodeSucceeded , bazPodNode .Phase ,
46304630 "Baz pod node should remain succeeded" )
46314631
@@ -4635,14 +4635,14 @@ status:
46354635
46364636 // Assertion 5: Verify workflow integrity is maintained
46374637 // Check that the root workflow node is in Running state
4638- rootNode := retryWf .Status .Nodes ["subdag-iterate-jrfch" ]
4639- require .NotNil (t , rootNode , "Root workflow node should exist" )
4638+ rootNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch" ]
4639+ require .True (t , ok , "Root workflow node should exist" )
46404640 assert .Equal (t , wfv1 .NodeRunning , rootNode .Phase ,
46414641 "Root workflow node should be in Running state" )
46424642
46434643 // Check that the main task group is in Running state
4644- mainTaskGroupNode := retryWf .Status .Nodes ["subdag-iterate-jrfch-1869003913" ]
4645- require .NotNil (t , mainTaskGroupNode , "Main task group node should exist" )
4644+ mainTaskGroupNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch-1869003913" ]
4645+ require .True (t , ok , "Main task group node should exist" )
46464646 assert .Equal (t , wfv1 .NodeRunning , mainTaskGroupNode .Phase ,
46474647 "Main task group node should be in Running state" )
46484648
@@ -4656,15 +4656,228 @@ status:
46564656
46574657 // Additional verification: Check that the target node's sibling (echo-param) is NOT deleted
46584658 // since retrying echo-bye should only affect that specific node, not its siblings
4659- siblingNode := retryWf .Status .Nodes ["subdag-iterate-jrfch-3231869949" ] // foo echo-param pod
4660- require .NotNil (t , siblingNode , "Sibling node should remain in the workflow" )
4659+ siblingNode , ok := retryWf .Status .Nodes ["subdag-iterate-jrfch-3231869949" ] // foo echo-param pod
4660+ require .True (t , ok , "Sibling node should remain in the workflow" )
46614661 assert .Equal (t , wfv1 .NodeSucceeded , siblingNode .Phase ,
46624662 "Sibling node should remain succeeded" )
46634663
46644664 // Verify that the pod deletion list has the correct number of entries
4665- // Should include only the target node
46664665 assert .Len (t , podsToDelete , 1 ,
46674666 "Pod deletion list should contain 1 pod (only the target)" )
4668- assert .Contains (t , podsToDelete , "subdag-iterate-jrfch-echo-1786915540" ,
4669- "Pod deletion list should contain the target node's pod" )
4667+ }
4668+
4669+ // TestFormulateRetryWorkflowWithParamOverride tests issue #15802:
4670+ // When retrying a workflow with withParam + retryStrategy and overriding
4671+ // the workflow parameter to a different value, the stale expanded nodes
4672+ // from the old parameter value should be cleaned up.
4673+ //
4674+ // Scenario:
4675+ // - Workflow has wfparams=["pass","fail"], expanding to launch(0:pass) and launch(1:fail)
4676+ // - launch(0:pass) succeeds, launch(1:fail) fails -> workflow Failed
4677+ // - User retries with parameter override wfparams=["fail"]
4678+ // - Expected: launch(0:pass) and its children are removed (stale nodes)
4679+ // - Bug: launch(0:pass) survives and causes inconsistent node tree
4680+ func TestFormulateRetryWorkflowWithParamOverride (t * testing.T ) {
4681+ ctx := logging .TestContext (t .Context ())
4682+
4683+ workflowYaml := `apiVersion: argoproj.io/v1alpha1
4684+ kind: Workflow
4685+ metadata:
4686+ name: workflow-retry
4687+ namespace: argo
4688+ labels:
4689+ workflows.argoproj.io/completed: "true"
4690+ workflows.argoproj.io/phase: Failed
4691+ spec:
4692+ entrypoint: main
4693+ arguments:
4694+ parameters:
4695+ - name: wfparams
4696+ value: '["pass","fail"]'
4697+ templates:
4698+ - name: main
4699+ dag:
4700+ tasks:
4701+ - name: launch
4702+ template: passfail
4703+ arguments:
4704+ parameters:
4705+ - name: pfparam
4706+ value: "{{item}}"
4707+ withParam: "{{workflow.parameters.wfparams}}"
4708+ - name: passfail
4709+ retryStrategy:
4710+ limit: "0"
4711+ inputs:
4712+ parameters:
4713+ - name: pfparam
4714+ container:
4715+ image: alpine:3.15.4
4716+ command: [sh, -c]
4717+ args:
4718+ - |
4719+ if [ "{{inputs.parameters.pfparam}}" = "fail" ]; then exit 1; else exit 0; fi
4720+ status:
4721+ phase: Failed
4722+ conditions:
4723+ - status: "True"
4724+ type: Completed
4725+ nodes:
4726+ workflow-retry:
4727+ id: workflow-retry
4728+ name: workflow-retry
4729+ displayName: workflow-retry
4730+ type: DAG
4731+ phase: Failed
4732+ templateName: main
4733+ templateScope: local/workflow-retry
4734+ startedAt: "2026-03-24T00:40:00Z"
4735+ finishedAt: "2026-03-24T00:42:32Z"
4736+ children:
4737+ - workflow-retry-2014360908
4738+ outboundNodes:
4739+ - workflow-retry-3976347509
4740+ - workflow-retry-1861571805
4741+ workflow-retry-2014360908:
4742+ id: workflow-retry-2014360908
4743+ name: workflow-retry.launch
4744+ displayName: launch
4745+ type: TaskGroup
4746+ phase: Failed
4747+ boundaryID: workflow-retry
4748+ templateName: passfail
4749+ templateScope: local/workflow-retry
4750+ startedAt: "2026-03-24T00:40:00Z"
4751+ finishedAt: "2026-03-24T00:42:22Z"
4752+ nodeFlag: {}
4753+ children:
4754+ - workflow-retry-816771446
4755+ - workflow-retry-1743784750
4756+ workflow-retry-816771446:
4757+ id: workflow-retry-816771446
4758+ name: workflow-retry.launch(0:pass)
4759+ displayName: launch(0:pass)
4760+ type: Retry
4761+ phase: Succeeded
4762+ boundaryID: workflow-retry
4763+ templateName: passfail
4764+ templateScope: local/workflow-retry
4765+ startedAt: "2026-03-24T00:40:00Z"
4766+ finishedAt: "2026-03-24T00:41:25Z"
4767+ nodeFlag:
4768+ retried: true
4769+ children:
4770+ - workflow-retry-3976347509
4771+ workflow-retry-3976347509:
4772+ id: workflow-retry-3976347509
4773+ name: workflow-retry.launch(0:pass)(0)
4774+ displayName: launch(0:pass)(0)
4775+ type: Pod
4776+ phase: Succeeded
4777+ boundaryID: workflow-retry
4778+ hostNodeName: node1
4779+ templateName: passfail
4780+ templateScope: local/workflow-retry
4781+ startedAt: "2026-03-24T00:40:00Z"
4782+ finishedAt: "2026-03-24T00:41:25Z"
4783+ taskResultSynced: true
4784+ inputs:
4785+ parameters:
4786+ - name: pfparam
4787+ value: pass
4788+ outputs:
4789+ exitCode: "0"
4790+ workflow-retry-1743784750:
4791+ id: workflow-retry-1743784750
4792+ name: workflow-retry.launch(1:fail)
4793+ displayName: launch(1:fail)
4794+ type: Retry
4795+ phase: Failed
4796+ boundaryID: workflow-retry
4797+ templateName: passfail
4798+ templateScope: local/workflow-retry
4799+ startedAt: "2026-03-24T00:40:00Z"
4800+ finishedAt: "2026-03-24T00:42:22Z"
4801+ message: No more retries left
4802+ nodeFlag:
4803+ retried: true
4804+ children:
4805+ - workflow-retry-1861571805
4806+ workflow-retry-1861571805:
4807+ id: workflow-retry-1861571805
4808+ name: workflow-retry.launch(1:fail)(0)
4809+ displayName: launch(1:fail)(0)
4810+ type: Pod
4811+ phase: Failed
4812+ boundaryID: workflow-retry
4813+ hostNodeName: node1
4814+ templateName: passfail
4815+ templateScope: local/workflow-retry
4816+ startedAt: "2026-03-24T00:40:00Z"
4817+ finishedAt: "2026-03-24T00:42:22Z"
4818+ taskResultSynced: true
4819+ inputs:
4820+ parameters:
4821+ - name: pfparam
4822+ value: fail
4823+ outputs:
4824+ exitCode: "1"
4825+ startedAt: "2026-03-24T00:40:00Z"
4826+ finishedAt: "2026-03-24T00:42:32Z"`
4827+
4828+ wf := wfv1 .MustUnmarshalWorkflow (workflowYaml )
4829+
4830+ // Retry with parameter override: change wfparams to ["fail"]
4831+ parameters := []string {`wfparams=["fail"]` }
4832+ retryWf , podsToDelete , err := FormulateRetryWorkflow (ctx , wf , false , "" , parameters )
4833+ require .NoError (t , err )
4834+ require .NotNil (t , retryWf )
4835+
4836+ // Verify the workflow parameter was overridden
4837+ for _ , p := range retryWf .Spec .Arguments .Parameters {
4838+ if p .Name == "wfparams" {
4839+ assert .Equal (t , `["fail"]` , p .GetValue (),
4840+ "wfparams should be overridden to [\" fail\" ]" )
4841+ }
4842+ }
4843+
4844+ // BUG ASSERTION (issue #15802):
4845+ // The stale node launch(0:pass) from the old parameter expansion should NOT exist
4846+ // in the retry workflow, because the new parameter ["fail"] will expand differently.
4847+ // If it still exists, the controller will create launch(0:fail) alongside the stale
4848+ // launch(0:pass), resulting in two children in the TaskGroup and a stuck workflow.
4849+ _ , staleNodeExists := retryWf .Status .Nodes ["workflow-retry-816771446" ] // launch(0:pass)
4850+ assert .False (t , staleNodeExists ,
4851+ "Stale node launch(0:pass) should be deleted when parameter override changes withParam expansion" )
4852+
4853+ _ , stalePodExists := retryWf .Status .Nodes ["workflow-retry-3976347509" ] // launch(0:pass)(0)
4854+ assert .False (t , stalePodExists ,
4855+ "Stale pod launch(0:pass)(0) should be deleted when parameter override changes withParam expansion" )
4856+
4857+ // The failed nodes should be deleted (standard retry behavior)
4858+ _ , failedRetryExists := retryWf .Status .Nodes ["workflow-retry-1743784750" ] // launch(1:fail)
4859+ assert .False (t , failedRetryExists ,
4860+ "Failed retry node launch(1:fail) should be deleted" )
4861+
4862+ _ , failedPodExists := retryWf .Status .Nodes ["workflow-retry-1861571805" ] // launch(1:fail)(0)
4863+ assert .False (t , failedPodExists ,
4864+ "Failed pod launch(1:fail)(0) should be deleted" )
4865+
4866+ // The TaskGroup should be reset to Running
4867+ taskGroupNode , ok := retryWf .Status .Nodes ["workflow-retry-2014360908" ] // launch TaskGroup
4868+ require .True (t , ok , "TaskGroup node should exist" )
4869+ assert .Equal (t , wfv1 .NodeRunning , taskGroupNode .Phase ,
4870+ "TaskGroup should be reset to Running" )
4871+
4872+ // The TaskGroup should have NO children (all old expansion nodes should be cleaned up)
4873+ assert .Empty (t , taskGroupNode .Children ,
4874+ "TaskGroup children should be empty after parameter override changes the expansion" )
4875+
4876+ // The failed pod should be in the deletion list
4877+ assert .Contains (t , podsToDelete , "workflow-retry-passfail-1861571805" ,
4878+ "Failed pod should be in deletion list" )
4879+
4880+ // The succeeded pod from the stale expansion should also be in the deletion list
4881+ assert .Contains (t , podsToDelete , "workflow-retry-passfail-3976347509" ,
4882+ "Stale succeeded pod should also be in deletion list" )
46704883}
0 commit comments