Skip to content

Commit fe8e5a9

Browse files
committed
fix(diskpromo): retry PromoteDisks on transient vSphere errors
Transient faults such as ConcurrentAccess indicate that a competing vSphere operation was in flight; they are self-resolving and do not warrant a permanent failure response. Previously, a failed PromoteDisks_Task caused the reconciler to return nil without requeuing, leaving the VM stuck until the errored task expired from vSphere's RecentTask list (~10 minutes). This change detects transient errors via fault.IsTransientError and continues the RecentTask loop instead of returning, allowing the next reconcile cycle to issue a fresh PromoteDisks_Task immediately. A dedicated condition reason (DiskPromotionTaskTransientError) is introduced to surface the transient-retry state to users. Bumps govmomi to v0.55.0-alpha.0.0.20260518191903-48ab34adb211 to include vmware/govmomi#4016, which classifies ConcurrentAccess as a transient error in fault.IsTransientError.
1 parent 1b4245a commit fe8e5a9

6 files changed

Lines changed: 126 additions & 13 deletions

File tree

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ require (
4949
github.com/vmware-tanzu/image-registry-operator-api v0.0.0-20250813160346-0f6259af5cbb
5050
github.com/vmware-tanzu/net-operator-api v0.0.0-20260501221253-4950cf50cd6b
5151
github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20260423081355-beab2417344a
52-
github.com/vmware/govmomi v0.54.0
52+
github.com/vmware/govmomi v0.55.0-alpha.0.0.20260518191903-48ab34adb211
5353
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93
5454
golang.org/x/net v0.53.0 // indirect
5555
// * https://github.com/vmware-tanzu/vm-operator/security/dependabot/24

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,8 @@ github.com/vmware-tanzu/net-operator-api v0.0.0-20260501221253-4950cf50cd6b h1:C
160160
github.com/vmware-tanzu/net-operator-api v0.0.0-20260501221253-4950cf50cd6b/go.mod h1:w6QJGm3crIA16ZIz1FVQXD2NVeJhOgGXxW05RbVTSTo=
161161
github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20260423081355-beab2417344a h1:yqGxhqSJ78veQjdOHINJLE9IWDcreMTzwDsOAdwrUWM=
162162
github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20260423081355-beab2417344a/go.mod h1:Q4JzNkNMvjo7pXtlB5/R3oME4Nhah7fAObWgghVmtxk=
163-
github.com/vmware/govmomi v0.54.0 h1:akEKkM9XKMOhTskmdzTLG8JzH+sh61jbFrVPbAzv5IQ=
164-
github.com/vmware/govmomi v0.54.0/go.mod h1:0F3hChqXDrSQQnjfSiCqRE5lPD4aZlbOtKG4uroq2a4=
163+
github.com/vmware/govmomi v0.55.0-alpha.0.0.20260518191903-48ab34adb211 h1:n8hoHi/26x5GaTKTS04PqC7bNrCh7Wa7Eh44RKTM214=
164+
github.com/vmware/govmomi v0.55.0-alpha.0.0.20260518191903-48ab34adb211/go.mod h1:0F3hChqXDrSQQnjfSiCqRE5lPD4aZlbOtKG4uroq2a4=
165165
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
166166
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
167167
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=

pkg/vmconfig/diskpromo/diskpromo_reconciler.go

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"context"
99
"fmt"
1010

11+
"github.com/vmware/govmomi/fault"
1112
"github.com/vmware/govmomi/object"
1213
"github.com/vmware/govmomi/vim25"
1314
"github.com/vmware/govmomi/vim25/mo"
@@ -29,9 +30,10 @@ type reconciler struct{}
2930
var _ vmconfig.Reconciler = reconciler{}
3031

3132
const (
32-
ReasonTaskError = "DiskPromotionTaskError"
33-
ReasonPending = "DiskPromotionPending"
34-
ReasonRunning = "DiskPromotionRunning"
33+
ReasonTaskError = "DiskPromotionTaskError"
34+
ReasonTaskTransientError = "DiskPromotionTaskTransientError"
35+
ReasonPending = "DiskPromotionPending"
36+
ReasonRunning = "DiskPromotionRunning"
3537

3638
PromoteDisksTaskKey = "VirtualMachine.promoteDisks"
3739
)
@@ -124,6 +126,37 @@ func (r reconciler) Reconcile(
124126
switch t.State {
125127

126128
case vimtypes.TaskInfoStateError:
129+
// A transient fault (e.g. ConcurrentAccess) means a competing
130+
// vSphere operation was in flight when this task was attempted.
131+
// The fault is self-resolving once that operation finishes —
132+
// no operator action is required.
133+
//
134+
// Continuing the loop instead of returning allows the next
135+
// reconcile to issue a fresh PromoteDisks task without waiting
136+
// for the errored task to expire from RecentTask (~10 minutes).
137+
// Two invariants ensure this does not create duplicate tasks:
138+
//
139+
// 1. If a PromoteDisks task is still running, the loop
140+
// returns early above before reaching obj.PromoteDisks.
141+
// 2. If the competing operation is still running, the
142+
// runningTaskInfo guard below blocks obj.PromoteDisks
143+
// until it completes.
144+
//
145+
// Ordering in RecentTask is also not a concern. All transient-
146+
// errored tasks are skipped regardless of how many exist or
147+
// where they appear. Any running or successful PromoteDisks
148+
// task encountered anywhere in the loop still returns early
149+
// and takes precedence.
150+
if fault.IsTransientError(t.Error) {
151+
pkgcond.MarkFalse(
152+
vm,
153+
vmopv1.VirtualMachineDiskPromotionSynced,
154+
ReasonTaskTransientError,
155+
"%s",
156+
t.Error.LocalizedMessage)
157+
continue
158+
}
159+
127160
pkgcond.MarkFalse(
128161
vm,
129162
vmopv1.VirtualMachineDiskPromotionSynced,

pkg/vmconfig/diskpromo/diskpromo_reconciler_test.go

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -476,26 +476,106 @@ var _ = Describe("Reconcile", Label(testlabels.V1Alpha5), func() {
476476
})
477477
})
478478

479-
When("there promote disks is called while already running", func() {
479+
When("there promote disks is called while already running", func() {
480+
BeforeEach(func() {
481+
ctx = pkgctx.WithVMRecentTasks(ctx, []vimtypes.TaskInfo{
482+
{
483+
State: vimtypes.TaskInfoStateRunning,
484+
DescriptionId: diskpromo.PromoteDisksTaskKey,
485+
},
486+
})
487+
})
488+
It("should mark the condition as running", func() {
489+
Expect(err).ToNot(HaveOccurred())
490+
c := conditions.Get(vm, vmopv1.VirtualMachineDiskPromotionSynced)
491+
Expect(c).ToNot(BeNil())
492+
Expect(c.Status).To(Equal(metav1.ConditionFalse))
493+
Expect(c.Reason).To(Equal(diskpromo.ReasonRunning))
494+
Expect(c.Message).To(Equal("Promotion is running"))
495+
})
496+
})
497+
498+
When("a previous promote disks task failed with a transient error", func() {
499+
transientTaskInfo := vimtypes.TaskInfo{
500+
State: vimtypes.TaskInfoStateError,
501+
DescriptionId: diskpromo.PromoteDisksTaskKey,
502+
Error: &vimtypes.LocalizedMethodFault{
503+
Fault: &vimtypes.ConcurrentAccess{},
504+
LocalizedMessage: "concurrent access",
505+
},
506+
}
507+
508+
When("no other tasks are running", func() {
509+
BeforeEach(func() {
510+
ctx = pkgctx.WithVMRecentTasks(ctx, []vimtypes.TaskInfo{
511+
transientTaskInfo,
512+
})
513+
})
514+
It("should issue a new promote disks task", func() {
515+
Expect(err).To(MatchError(diskpromo.ErrPromoteDisks))
516+
c := conditions.Get(vm, vmopv1.VirtualMachineDiskPromotionSynced)
517+
Expect(c).ToNot(BeNil())
518+
Expect(c.Status).To(Equal(metav1.ConditionFalse))
519+
Expect(c.Reason).To(Equal(diskpromo.ReasonRunning))
520+
})
521+
})
522+
523+
When("a competing task is still running", func() {
524+
BeforeEach(func() {
525+
ctx = pkgctx.WithVMRecentTasks(ctx, []vimtypes.TaskInfo{
526+
transientTaskInfo,
527+
{
528+
State: vimtypes.TaskInfoStateRunning,
529+
DescriptionId: "fake.concurrent.task",
530+
},
531+
})
532+
})
533+
It("should wait and mark pending", func() {
534+
Expect(err).ToNot(HaveOccurred())
535+
c := conditions.Get(vm, vmopv1.VirtualMachineDiskPromotionSynced)
536+
Expect(c).ToNot(BeNil())
537+
Expect(c.Status).To(Equal(metav1.ConditionFalse))
538+
Expect(c.Reason).To(Equal(diskpromo.ReasonPending))
539+
})
540+
})
541+
542+
When("a promote disks task is already running", func() {
480543
BeforeEach(func() {
481544
ctx = pkgctx.WithVMRecentTasks(ctx, []vimtypes.TaskInfo{
545+
transientTaskInfo,
482546
{
483547
State: vimtypes.TaskInfoStateRunning,
484548
DescriptionId: diskpromo.PromoteDisksTaskKey,
485549
},
486550
})
487551
})
488-
It("should mark the condition as running", func() {
552+
It("should wait and mark running", func() {
489553
Expect(err).ToNot(HaveOccurred())
490554
c := conditions.Get(vm, vmopv1.VirtualMachineDiskPromotionSynced)
491555
Expect(c).ToNot(BeNil())
492556
Expect(c.Status).To(Equal(metav1.ConditionFalse))
493557
Expect(c.Reason).To(Equal(diskpromo.ReasonRunning))
494-
Expect(c.Message).To(Equal("Promotion is running"))
495558
})
496559
})
497560

498-
When("VM has no child disks and no existing condition", func() {
561+
When("multiple transient errors are present", func() {
562+
BeforeEach(func() {
563+
ctx = pkgctx.WithVMRecentTasks(ctx, []vimtypes.TaskInfo{
564+
transientTaskInfo,
565+
transientTaskInfo,
566+
})
567+
})
568+
It("should issue a new promote disks task", func() {
569+
Expect(err).To(MatchError(diskpromo.ErrPromoteDisks))
570+
c := conditions.Get(vm, vmopv1.VirtualMachineDiskPromotionSynced)
571+
Expect(c).ToNot(BeNil())
572+
Expect(c.Status).To(Equal(metav1.ConditionFalse))
573+
Expect(c.Reason).To(Equal(diskpromo.ReasonRunning))
574+
})
575+
})
576+
})
577+
578+
When("VM has no child disks and no existing condition", func() {
499579
BeforeEach(func() {
500580
// Remove all child disks
501581
moVM.Config.Hardware.Device = nil

test/e2e/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ require (
6666
github.com/vmware-tanzu/vm-operator/external/tanzu-topology v0.0.0-00010101000000-000000000000
6767
github.com/vmware-tanzu/vm-operator/external/vsphere-csi-driver v0.0.0-00010101000000-000000000000
6868
github.com/vmware-tanzu/vm-operator/pkg/backup/api v0.0.0-00010101000000-000000000000
69-
github.com/vmware/govmomi v0.54.0
69+
github.com/vmware/govmomi v0.55.0-alpha.0.0.20260518191903-48ab34adb211
7070
golang.org/x/crypto v0.50.0
7171
gopkg.in/yaml.v2 v2.4.0
7272
gopkg.in/yaml.v3 v3.0.1

test/e2e/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,8 @@ github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
218218
github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
219219
github.com/vmware-tanzu/net-operator-api v0.0.0-20260501221253-4950cf50cd6b h1:Cjg5Iy0Lusv81iISuxHZuikSfYSy1SeePu7iqH214BY=
220220
github.com/vmware-tanzu/net-operator-api v0.0.0-20260501221253-4950cf50cd6b/go.mod h1:w6QJGm3crIA16ZIz1FVQXD2NVeJhOgGXxW05RbVTSTo=
221-
github.com/vmware/govmomi v0.54.0 h1:akEKkM9XKMOhTskmdzTLG8JzH+sh61jbFrVPbAzv5IQ=
222-
github.com/vmware/govmomi v0.54.0/go.mod h1:0F3hChqXDrSQQnjfSiCqRE5lPD4aZlbOtKG4uroq2a4=
221+
github.com/vmware/govmomi v0.55.0-alpha.0.0.20260518191903-48ab34adb211 h1:n8hoHi/26x5GaTKTS04PqC7bNrCh7Wa7Eh44RKTM214=
222+
github.com/vmware/govmomi v0.55.0-alpha.0.0.20260518191903-48ab34adb211/go.mod h1:0F3hChqXDrSQQnjfSiCqRE5lPD4aZlbOtKG4uroq2a4=
223223
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
224224
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
225225
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=

0 commit comments

Comments
 (0)