Skip to content

Commit d191ce7

Browse files
fix: delete stale TaskGroup children on retry with parameter override. Fixes #15802
When retrying a workflow with parameter override that changes a withParam/withItems/withSequence expansion, FormulateRetryWorkflow now deletes all children of reset TaskGroup/StepGroup nodes. Previously, stale expansion nodes (e.g. launch(0:pass) from the old parameter value) survived the retry and accumulated alongside newly expanded nodes, causing the workflow to get stuck in Running status. Also align test assertions with codebase conventions: - Use _, ok map access pattern instead of require.NotNil on value types - Remove dead wfClientSet code - Remove duplicate assert.Contains - Condense production code comment Signed-off-by: Stanley Shen <stanley.shen2003@gmail.com> Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1e95e6d commit d191ce7

2 files changed

Lines changed: 250 additions & 19 deletions

File tree

workflow/util/util.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,6 +1233,24 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
12331233
toDelete = setUnion(toDelete, pathToDelete)
12341234
}
12351235

1236+
// Delete children of TaskGroup/StepGroup nodes being reset when parameters are overridden,
1237+
// so the controller can re-expand them with the new values. Fixes #15802.
1238+
if len(parameters) > 0 {
1239+
for nodeID := range toReset {
1240+
if toDelete[nodeID] {
1241+
continue
1242+
}
1243+
n := wf.Status.Nodes[nodeID]
1244+
if n.Type == wfv1.NodeTypeTaskGroup || n.Type == wfv1.NodeTypeStepGroup {
1245+
if dagNode, ok := nodesMap[nodeID]; ok {
1246+
for childID := range getChildren(dagNode) {
1247+
toDelete[childID] = true
1248+
}
1249+
}
1250+
}
1251+
}
1252+
}
1253+
12361254
for nodeID := range toReset {
12371255
// avoid resetting nodes that are marked for deletion
12381256
if in := toDelete[nodeID]; in {

workflow/util/util_test.go

Lines changed: 232 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4601,31 +4601,31 @@ status:
46014601

46024602
// Assertion 2: The parent task group node should be in Running state
46034603
// The target node's parent task group is "subdag-iterate-jrfch-1295605062" (echo-group(0:foo))
4604-
parentTaskGroupNode := retryWf.Status.Nodes["subdag-iterate-jrfch-1295605062"]
4605-
require.NotNil(t, parentTaskGroupNode, "Parent task group node should exist")
4604+
parentTaskGroupNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch-1295605062"]
4605+
require.True(t, ok, "Parent task group node should exist")
46064606
assert.Equal(t, wfv1.NodeRunning, parentTaskGroupNode.Phase,
46074607
"Parent task group node should be in Running state")
46084608

46094609
// Assertion 3: Other nodes in different iterations should remain unchanged
46104610
// Check nodes from other iterations (bar and baz) remain succeeded
4611-
barIterationNode := retryWf.Status.Nodes["subdag-iterate-jrfch-4005883592"] // echo-group(1:bar)
4612-
require.NotNil(t, barIterationNode, "Bar iteration node should exist")
4611+
barIterationNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch-4005883592"] // echo-group(1:bar)
4612+
require.True(t, ok, "Bar iteration node should exist")
46134613
assert.Equal(t, wfv1.NodeSucceeded, barIterationNode.Phase,
46144614
"Bar iteration node should remain succeeded")
46154615

4616-
bazIterationNode := retryWf.Status.Nodes["subdag-iterate-jrfch-851163487"] // echo-group(2:baz)
4617-
require.NotNil(t, bazIterationNode, "Baz iteration node should exist")
4616+
bazIterationNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch-851163487"] // echo-group(2:baz)
4617+
require.True(t, ok, "Baz iteration node should exist")
46184618
assert.Equal(t, wfv1.NodeSucceeded, bazIterationNode.Phase,
46194619
"Baz iteration node should remain succeeded")
46204620

46214621
// Check that nodes from other iterations' pods remain succeeded
4622-
barPodNode := retryWf.Status.Nodes["subdag-iterate-jrfch-2240770818"] // bar echo-bye pod
4623-
require.NotNil(t, barPodNode, "Bar pod node should exist")
4622+
barPodNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch-2240770818"] // bar echo-bye pod
4623+
require.True(t, ok, "Bar pod node should exist")
46244624
assert.Equal(t, wfv1.NodeSucceeded, barPodNode.Phase,
46254625
"Bar pod node should remain succeeded")
46264626

4627-
bazPodNode := retryWf.Status.Nodes["subdag-iterate-jrfch-656740947"] // baz echo-bye pod
4628-
require.NotNil(t, bazPodNode, "Baz pod node should exist")
4627+
bazPodNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch-656740947"] // baz echo-bye pod
4628+
require.True(t, ok, "Baz pod node should exist")
46294629
assert.Equal(t, wfv1.NodeSucceeded, bazPodNode.Phase,
46304630
"Baz pod node should remain succeeded")
46314631

@@ -4635,14 +4635,14 @@ status:
46354635

46364636
// Assertion 5: Verify workflow integrity is maintained
46374637
// Check that the root workflow node is in Running state
4638-
rootNode := retryWf.Status.Nodes["subdag-iterate-jrfch"]
4639-
require.NotNil(t, rootNode, "Root workflow node should exist")
4638+
rootNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch"]
4639+
require.True(t, ok, "Root workflow node should exist")
46404640
assert.Equal(t, wfv1.NodeRunning, rootNode.Phase,
46414641
"Root workflow node should be in Running state")
46424642

46434643
// Check that the main task group is in Running state
4644-
mainTaskGroupNode := retryWf.Status.Nodes["subdag-iterate-jrfch-1869003913"]
4645-
require.NotNil(t, mainTaskGroupNode, "Main task group node should exist")
4644+
mainTaskGroupNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch-1869003913"]
4645+
require.True(t, ok, "Main task group node should exist")
46464646
assert.Equal(t, wfv1.NodeRunning, mainTaskGroupNode.Phase,
46474647
"Main task group node should be in Running state")
46484648

@@ -4656,15 +4656,228 @@ status:
46564656

46574657
// Additional verification: Check that the target node's sibling (echo-param) is NOT deleted
46584658
// since retrying echo-bye should only affect that specific node, not its siblings
4659-
siblingNode := retryWf.Status.Nodes["subdag-iterate-jrfch-3231869949"] // foo echo-param pod
4660-
require.NotNil(t, siblingNode, "Sibling node should remain in the workflow")
4659+
siblingNode, ok := retryWf.Status.Nodes["subdag-iterate-jrfch-3231869949"] // foo echo-param pod
4660+
require.True(t, ok, "Sibling node should remain in the workflow")
46614661
assert.Equal(t, wfv1.NodeSucceeded, siblingNode.Phase,
46624662
"Sibling node should remain succeeded")
46634663

46644664
// Verify that the pod deletion list has the correct number of entries
4665-
// Should include only the target node
46664665
assert.Len(t, podsToDelete, 1,
46674666
"Pod deletion list should contain 1 pod (only the target)")
4668-
assert.Contains(t, podsToDelete, "subdag-iterate-jrfch-echo-1786915540",
4669-
"Pod deletion list should contain the target node's pod")
4667+
}
4668+
4669+
// TestFormulateRetryWorkflowWithParamOverride tests issue #15802:
4670+
// When retrying a workflow with withParam + retryStrategy and overriding
4671+
// the workflow parameter to a different value, the stale expanded nodes
4672+
// from the old parameter value should be cleaned up.
4673+
//
4674+
// Scenario:
4675+
// - Workflow has wfparams=["pass","fail"], expanding to launch(0:pass) and launch(1:fail)
4676+
// - launch(0:pass) succeeds, launch(1:fail) fails -> workflow Failed
4677+
// - User retries with parameter override wfparams=["fail"]
4678+
// - Expected: launch(0:pass) and its children are removed (stale nodes)
4679+
// - Bug: launch(0:pass) survives and causes inconsistent node tree
4680+
func TestFormulateRetryWorkflowWithParamOverride(t *testing.T) {
4681+
ctx := logging.TestContext(t.Context())
4682+
4683+
workflowYaml := `apiVersion: argoproj.io/v1alpha1
4684+
kind: Workflow
4685+
metadata:
4686+
name: workflow-retry
4687+
namespace: argo
4688+
labels:
4689+
workflows.argoproj.io/completed: "true"
4690+
workflows.argoproj.io/phase: Failed
4691+
spec:
4692+
entrypoint: main
4693+
arguments:
4694+
parameters:
4695+
- name: wfparams
4696+
value: '["pass","fail"]'
4697+
templates:
4698+
- name: main
4699+
dag:
4700+
tasks:
4701+
- name: launch
4702+
template: passfail
4703+
arguments:
4704+
parameters:
4705+
- name: pfparam
4706+
value: "{{item}}"
4707+
withParam: "{{workflow.parameters.wfparams}}"
4708+
- name: passfail
4709+
retryStrategy:
4710+
limit: "0"
4711+
inputs:
4712+
parameters:
4713+
- name: pfparam
4714+
container:
4715+
image: alpine:3.15.4
4716+
command: [sh, -c]
4717+
args:
4718+
- |
4719+
if [ "{{inputs.parameters.pfparam}}" = "fail" ]; then exit 1; else exit 0; fi
4720+
status:
4721+
phase: Failed
4722+
conditions:
4723+
- status: "True"
4724+
type: Completed
4725+
nodes:
4726+
workflow-retry:
4727+
id: workflow-retry
4728+
name: workflow-retry
4729+
displayName: workflow-retry
4730+
type: DAG
4731+
phase: Failed
4732+
templateName: main
4733+
templateScope: local/workflow-retry
4734+
startedAt: "2026-03-24T00:40:00Z"
4735+
finishedAt: "2026-03-24T00:42:32Z"
4736+
children:
4737+
- workflow-retry-2014360908
4738+
outboundNodes:
4739+
- workflow-retry-3976347509
4740+
- workflow-retry-1861571805
4741+
workflow-retry-2014360908:
4742+
id: workflow-retry-2014360908
4743+
name: workflow-retry.launch
4744+
displayName: launch
4745+
type: TaskGroup
4746+
phase: Failed
4747+
boundaryID: workflow-retry
4748+
templateName: passfail
4749+
templateScope: local/workflow-retry
4750+
startedAt: "2026-03-24T00:40:00Z"
4751+
finishedAt: "2026-03-24T00:42:22Z"
4752+
nodeFlag: {}
4753+
children:
4754+
- workflow-retry-816771446
4755+
- workflow-retry-1743784750
4756+
workflow-retry-816771446:
4757+
id: workflow-retry-816771446
4758+
name: workflow-retry.launch(0:pass)
4759+
displayName: launch(0:pass)
4760+
type: Retry
4761+
phase: Succeeded
4762+
boundaryID: workflow-retry
4763+
templateName: passfail
4764+
templateScope: local/workflow-retry
4765+
startedAt: "2026-03-24T00:40:00Z"
4766+
finishedAt: "2026-03-24T00:41:25Z"
4767+
nodeFlag:
4768+
retried: true
4769+
children:
4770+
- workflow-retry-3976347509
4771+
workflow-retry-3976347509:
4772+
id: workflow-retry-3976347509
4773+
name: workflow-retry.launch(0:pass)(0)
4774+
displayName: launch(0:pass)(0)
4775+
type: Pod
4776+
phase: Succeeded
4777+
boundaryID: workflow-retry
4778+
hostNodeName: node1
4779+
templateName: passfail
4780+
templateScope: local/workflow-retry
4781+
startedAt: "2026-03-24T00:40:00Z"
4782+
finishedAt: "2026-03-24T00:41:25Z"
4783+
taskResultSynced: true
4784+
inputs:
4785+
parameters:
4786+
- name: pfparam
4787+
value: pass
4788+
outputs:
4789+
exitCode: "0"
4790+
workflow-retry-1743784750:
4791+
id: workflow-retry-1743784750
4792+
name: workflow-retry.launch(1:fail)
4793+
displayName: launch(1:fail)
4794+
type: Retry
4795+
phase: Failed
4796+
boundaryID: workflow-retry
4797+
templateName: passfail
4798+
templateScope: local/workflow-retry
4799+
startedAt: "2026-03-24T00:40:00Z"
4800+
finishedAt: "2026-03-24T00:42:22Z"
4801+
message: No more retries left
4802+
nodeFlag:
4803+
retried: true
4804+
children:
4805+
- workflow-retry-1861571805
4806+
workflow-retry-1861571805:
4807+
id: workflow-retry-1861571805
4808+
name: workflow-retry.launch(1:fail)(0)
4809+
displayName: launch(1:fail)(0)
4810+
type: Pod
4811+
phase: Failed
4812+
boundaryID: workflow-retry
4813+
hostNodeName: node1
4814+
templateName: passfail
4815+
templateScope: local/workflow-retry
4816+
startedAt: "2026-03-24T00:40:00Z"
4817+
finishedAt: "2026-03-24T00:42:22Z"
4818+
taskResultSynced: true
4819+
inputs:
4820+
parameters:
4821+
- name: pfparam
4822+
value: fail
4823+
outputs:
4824+
exitCode: "1"
4825+
startedAt: "2026-03-24T00:40:00Z"
4826+
finishedAt: "2026-03-24T00:42:32Z"`
4827+
4828+
wf := wfv1.MustUnmarshalWorkflow(workflowYaml)
4829+
4830+
// Retry with parameter override: change wfparams to ["fail"]
4831+
parameters := []string{`wfparams=["fail"]`}
4832+
retryWf, podsToDelete, err := FormulateRetryWorkflow(ctx, wf, false, "", parameters)
4833+
require.NoError(t, err)
4834+
require.NotNil(t, retryWf)
4835+
4836+
// Verify the workflow parameter was overridden
4837+
for _, p := range retryWf.Spec.Arguments.Parameters {
4838+
if p.Name == "wfparams" {
4839+
assert.Equal(t, `["fail"]`, p.GetValue(),
4840+
"wfparams should be overridden to [\"fail\"]")
4841+
}
4842+
}
4843+
4844+
// BUG ASSERTION (issue #15802):
4845+
// The stale node launch(0:pass) from the old parameter expansion should NOT exist
4846+
// in the retry workflow, because the new parameter ["fail"] will expand differently.
4847+
// If it still exists, the controller will create launch(0:fail) alongside the stale
4848+
// launch(0:pass), resulting in two children in the TaskGroup and a stuck workflow.
4849+
_, staleNodeExists := retryWf.Status.Nodes["workflow-retry-816771446"] // launch(0:pass)
4850+
assert.False(t, staleNodeExists,
4851+
"Stale node launch(0:pass) should be deleted when parameter override changes withParam expansion")
4852+
4853+
_, stalePodExists := retryWf.Status.Nodes["workflow-retry-3976347509"] // launch(0:pass)(0)
4854+
assert.False(t, stalePodExists,
4855+
"Stale pod launch(0:pass)(0) should be deleted when parameter override changes withParam expansion")
4856+
4857+
// The failed nodes should be deleted (standard retry behavior)
4858+
_, failedRetryExists := retryWf.Status.Nodes["workflow-retry-1743784750"] // launch(1:fail)
4859+
assert.False(t, failedRetryExists,
4860+
"Failed retry node launch(1:fail) should be deleted")
4861+
4862+
_, failedPodExists := retryWf.Status.Nodes["workflow-retry-1861571805"] // launch(1:fail)(0)
4863+
assert.False(t, failedPodExists,
4864+
"Failed pod launch(1:fail)(0) should be deleted")
4865+
4866+
// The TaskGroup should be reset to Running
4867+
taskGroupNode, ok := retryWf.Status.Nodes["workflow-retry-2014360908"] // launch TaskGroup
4868+
require.True(t, ok, "TaskGroup node should exist")
4869+
assert.Equal(t, wfv1.NodeRunning, taskGroupNode.Phase,
4870+
"TaskGroup should be reset to Running")
4871+
4872+
// The TaskGroup should have NO children (all old expansion nodes should be cleaned up)
4873+
assert.Empty(t, taskGroupNode.Children,
4874+
"TaskGroup children should be empty after parameter override changes the expansion")
4875+
4876+
// The failed pod should be in the deletion list
4877+
assert.Contains(t, podsToDelete, "workflow-retry-passfail-1861571805",
4878+
"Failed pod should be in deletion list")
4879+
4880+
// The succeeded pod from the stale expansion should also be in the deletion list
4881+
assert.Contains(t, podsToDelete, "workflow-retry-passfail-3976347509",
4882+
"Stale succeeded pod should also be in deletion list")
46704883
}

0 commit comments

Comments
 (0)