Skip to content

Commit 0cf6c35

Browse files
Wait for libvirt connection in hypervisor controller
1 parent 21b8e95 commit 0cf6c35

2 files changed

Lines changed: 177 additions & 5 deletions

File tree

internal/controller/hypervisor_controller.go

Lines changed: 45 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,11 @@ type HypervisorReconciler struct {
5858

5959
// Channel that can be used to trigger reconcile events.
6060
reconcileCh chan event.GenericEvent
61+
62+
// Interval to wait between libvirt connection attempts in the Start
63+
// method. When left at zero, Start falls back to a default of 5 seconds;
64+
// tests can set a shorter interval to keep retries fast.
65+
libvirtConnectInterval time.Duration
6166
}
6267

6368
const (
@@ -195,6 +200,9 @@ func (r *HypervisorReconciler) Reconcile(ctx context.Context, req ctrl.Request)
195200
Message: fmt.Sprintf("unable to connect to libvirtd: %v", err),
196201
Reason: "ConnectFailed",
197202
})
203+
// TODO: When libvirt is down, we should also set the overall Ready
204+
// condition to false, because without libvirt connection, we won't be
205+
// able to detect capacity and other scheduling-relevant details.
198206
} else {
199207
// We're connected.
200208
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
@@ -326,10 +334,44 @@ func (r *HypervisorReconciler) Start(ctx context.Context) error {
326334
log := logger.FromContext(ctx, "controller", "hypervisor")
327335
log.Info("starting libvirt event subscription")
328336

329-
// Ensure we're connected to libvirt.
330-
if err := r.Libvirt.Connect(); err != nil {
337+
// Get the hypervisor we will reconcile.
338+
var hypervisor kvmv1.Hypervisor
339+
key := client.ObjectKey{Name: sys.Hostname} // Cluster-scoped
340+
if err := r.Get(ctx, key, &hypervisor); err != nil {
341+
return fmt.Errorf("unable to get hypervisor: %w", err)
342+
}
343+
344+
// Block until we're connected to libvirt.
345+
for {
346+
// Exit if the context is done, e.g. when the manager is shutting down.
347+
if ctx.Err() != nil {
348+
return fmt.Errorf("context done while trying to connect to libvirt: %w", ctx.Err())
349+
}
350+
err := r.Libvirt.Connect()
351+
if err == nil {
352+
log.Info("connected to libvirt")
353+
break // Connected successfully
354+
}
331355
log.Error(err, "unable to connect to libvirt")
332-
return err
356+
// Set the hypervisor's LibVirtType condition to false with the
357+
// error message, so that it's visible in the status.
358+
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
359+
Type: LibVirtType, // TODO: This should be a kvmv1 condition.
360+
Status: metav1.ConditionFalse,
361+
Message: fmt.Sprintf("unable to connect to libvirt: %v", err),
362+
Reason: "ConnectFailed",
363+
})
364+
patch := client.MergeFromWithOptions(hypervisor.DeepCopy(), client.MergeFromWithOptimisticLock{})
365+
if err := r.Status().Patch(ctx, &hypervisor, patch); err != nil {
366+
log.Error(err, "unable to update hypervisor status after failed libvirt connection")
367+
}
368+
log.Info("updated hypervisor status after failed libvirt connection")
369+
timeToSleep := r.libvirtConnectInterval
370+
if timeToSleep == 0 {
371+
timeToSleep = 5 * time.Second // default value
372+
}
373+
log.Info("retrying libvirt connection after sleeping", "duration", timeToSleep)
374+
time.Sleep(timeToSleep)
333375
}
334376

335377
// Run a ticker which reconciles the hypervisor resource every minute.

internal/controller/hypervisor_controller_test.go

Lines changed: 132 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,25 @@ var _ = Describe("Hypervisor Controller", func() {
4545
Context("When testing Start method", func() {
4646
It("should successfully start and subscribe to libvirt events", func() {
4747
ctx := context.Background()
48+
49+
// Create a hypervisor resource for this test
50+
hypervisorName := "start-success-test-hypervisor"
51+
originalHostname := sys.Hostname
52+
sys.Hostname = hypervisorName
53+
defer func() {
54+
sys.Hostname = originalHostname
55+
}()
56+
57+
hypervisor := &kvmv1.Hypervisor{
58+
ObjectMeta: metav1.ObjectMeta{
59+
Name: hypervisorName,
60+
},
61+
}
62+
Expect(k8sClient.Create(ctx, hypervisor)).To(Succeed())
63+
defer func() {
64+
_ = k8sClient.Delete(ctx, hypervisor)
65+
}()
66+
4867
eventCallbackCalled := false
4968

5069
controllerReconciler := &HypervisorReconciler{
@@ -68,7 +87,26 @@ var _ = Describe("Hypervisor Controller", func() {
6887
})
6988

7089
It("should fail when libvirt connection fails", func() {
71-
ctx := context.Background()
90+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
91+
defer cancel()
92+
93+
// Create a hypervisor resource for this test
94+
hypervisorName := "start-fail-test-hypervisor"
95+
originalHostname := sys.Hostname
96+
sys.Hostname = hypervisorName
97+
defer func() {
98+
sys.Hostname = originalHostname
99+
}()
100+
101+
hypervisor := &kvmv1.Hypervisor{
102+
ObjectMeta: metav1.ObjectMeta{
103+
Name: hypervisorName,
104+
},
105+
}
106+
Expect(k8sClient.Create(context.Background(), hypervisor)).To(Succeed())
107+
defer func() {
108+
_ = k8sClient.Delete(context.Background(), hypervisor)
109+
}()
72110

73111
controllerReconciler := &HypervisorReconciler{
74112
Client: k8sClient,
@@ -78,12 +116,104 @@ var _ = Describe("Hypervisor Controller", func() {
78116
return errors.New("connection failed")
79117
},
80118
},
119+
reconcileCh: make(chan event.GenericEvent, 1),
120+
libvirtConnectInterval: 10 * time.Millisecond,
121+
}
122+
123+
// Start runs in a goroutine so we can cancel the context
124+
done := make(chan error, 1)
125+
go func() {
126+
done <- controllerReconciler.Start(ctx)
127+
}()
128+
129+
// Wait for either completion or context cancellation
130+
select {
131+
case <-ctx.Done():
132+
// Context was cancelled, which is expected since the Start method
133+
// retries indefinitely until connected. The test passes because
134+
// we verified the connection fails and retries.
135+
case err := <-done:
136+
// If Start returns, it should be due to context cancellation
137+
Expect(err).To(HaveOccurred())
138+
Expect(err.Error()).To(ContainSubstring("context done while trying to connect to libvirt"))
139+
}
140+
})
141+
142+
It("should fail when hypervisor resource does not exist", func() {
	// Point the hostname at a name this spec never creates a Hypervisor
	// for, and restore the previous hostname when the spec ends.
	defer func(previous string) {
		sys.Hostname = previous
	}(sys.Hostname)
	sys.Hostname = "non-existent-hypervisor"

	// Libvirt mock whose Connect always succeeds, so the only expected
	// failure is the hypervisor lookup.
	libvirtMock := &libvirt.InterfaceMock{
		ConnectFunc: func() error { return nil },
	}
	reconciler := &HypervisorReconciler{
		Client:      k8sClient,
		Scheme:      k8sClient.Scheme(),
		Libvirt:     libvirtMock,
		reconcileCh: make(chan event.GenericEvent, 1),
	}

	startErr := reconciler.Start(context.Background())
	Expect(startErr).To(HaveOccurred())
	Expect(startErr.Error()).To(ContainSubstring("unable to get hypervisor"))
})
167+
168+
It("should retry libvirt connection and succeed after initial failures", func() {
169+
ctx := context.Background()
170+
171+
// Create a hypervisor resource for this test
172+
hypervisorName := "start-retry-test-hypervisor"
173+
originalHostname := sys.Hostname
174+
sys.Hostname = hypervisorName
175+
defer func() {
176+
sys.Hostname = originalHostname
177+
}()
178+
179+
hypervisor := &kvmv1.Hypervisor{
180+
ObjectMeta: metav1.ObjectMeta{
181+
Name: hypervisorName,
182+
},
183+
}
184+
Expect(k8sClient.Create(ctx, hypervisor)).To(Succeed())
185+
defer func() {
186+
_ = k8sClient.Delete(ctx, hypervisor)
187+
}()
188+
189+
// Track connection attempts
190+
connectAttempts := 0
191+
eventCallbackCalled := false
192+
193+
controllerReconciler := &HypervisorReconciler{
194+
Client: k8sClient,
195+
Scheme: k8sClient.Scheme(),
196+
Libvirt: &libvirt.InterfaceMock{
197+
ConnectFunc: func() error {
198+
connectAttempts++
199+
// Fail first 2 attempts, succeed on 3rd
200+
if connectAttempts < 3 {
201+
return errors.New("connection failed")
202+
}
203+
return nil
204+
},
205+
WatchDomainChangesFunc: func(eventId golibvirt.DomainEventID, handlerId string, handler func(context.Context, any)) {
206+
eventCallbackCalled = true
207+
},
208+
},
209+
reconcileCh: make(chan event.GenericEvent, 1),
210+
libvirtConnectInterval: 10 * time.Millisecond, // Use short interval for fast test
211+
}
212+
213+
err := controllerReconciler.Start(ctx)
214+
Expect(err).NotTo(HaveOccurred())
215+
Expect(connectAttempts).To(Equal(3))
216+
Expect(eventCallbackCalled).To(BeTrue())
87217
})
88218
})
89219

0 commit comments

Comments
 (0)