Skip to content

Commit 51cbffd

Browse files
authored
Merge pull request #58 from cobaltcore-dev/wait-for-connection
Wait for libvirt connection in hypervisor controller
2 parents 21b8e95 + 0739870 commit 51cbffd

2 files changed

Lines changed: 201 additions & 5 deletions

File tree

internal/controller/hypervisor_controller.go

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ type HypervisorReconciler struct {
5858

5959
// Channel that can be used to trigger reconcile events.
6060
reconcileCh chan event.GenericEvent
61+
62+
// An interval that determines how long to wait between connection attempts
63+
// to libvirt. This is used in the Start method when trying to connect to
64+
// libvirt, and can be set to a lower value for testing purposes.
65+
libvirtConnectInterval time.Duration
6166
}
6267

6368
const (
@@ -195,6 +200,9 @@ func (r *HypervisorReconciler) Reconcile(ctx context.Context, req ctrl.Request)
195200
Message: fmt.Sprintf("unable to connect to libvirtd: %v", err),
196201
Reason: "ConnectFailed",
197202
})
203+
// TODO: When libvirt is down, we should also set the overall Ready
204+
// condition to false, because without libvirt connection, we won't be
205+
// able to detect capacity and other scheduling-relevant details.
198206
} else {
199207
// We're connected.
200208
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
@@ -326,10 +334,45 @@ func (r *HypervisorReconciler) Start(ctx context.Context) error {
326334
log := logger.FromContext(ctx, "controller", "hypervisor")
327335
log.Info("starting libvirt event subscription")
328336

329-
// Ensure we're connected to libvirt.
330-
if err := r.Libvirt.Connect(); err != nil {
337+
// Get the hypervisor we will reconcile.
338+
var hypervisor kvmv1.Hypervisor
339+
key := client.ObjectKey{Name: sys.Hostname} // Cluster-scoped
340+
if err := r.Get(ctx, key, &hypervisor); err != nil {
341+
return fmt.Errorf("unable to get hypervisor: %w", err)
342+
}
343+
344+
// Block until we're connected to libvirt.
345+
for {
346+
// Exit if the context is done, e.g. when the manager is shutting down.
347+
if ctx.Err() != nil {
348+
return fmt.Errorf("context done while trying to connect to libvirt: %w", ctx.Err())
349+
}
350+
err := r.Libvirt.Connect()
351+
if err == nil {
352+
log.Info("connected to libvirt")
353+
break // Connected successfully
354+
}
331355
log.Error(err, "unable to connect to libvirt")
332-
return err
356+
// Set the hypervisor's LibVirtType condition to false with the
357+
// error message, so that it's visible in the status.
358+
base := hypervisor.DeepCopy()
359+
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
360+
Type: LibVirtType, // TODO: This should be a kvmv1 condition.
361+
Status: metav1.ConditionFalse,
362+
Message: fmt.Sprintf("unable to connect to libvirt: %v", err),
363+
Reason: "ConnectFailed",
364+
})
365+
patch := client.MergeFromWithOptions(base, client.MergeFromWithOptimisticLock{})
366+
if err := r.Status().Patch(ctx, &hypervisor, patch); err != nil {
367+
log.Error(err, "unable to update hypervisor status after failed libvirt connection")
368+
}
369+
log.Info("updated hypervisor status after failed libvirt connection")
370+
timeToSleep := r.libvirtConnectInterval
371+
if timeToSleep == 0 {
372+
timeToSleep = 5 * time.Second // default value
373+
}
374+
log.Info("retrying libvirt connection after sleeping", "duration", timeToSleep)
375+
time.Sleep(timeToSleep)
333376
}
334377

335378
// Run a ticker which reconciles the hypervisor resource every minute.

internal/controller/hypervisor_controller_test.go

Lines changed: 155 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package controller
2020
import (
2121
"context"
2222
"errors"
23+
"strings"
2324
"time"
2425

2526
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -45,6 +46,26 @@ var _ = Describe("Hypervisor Controller", func() {
4546
Context("When testing Start method", func() {
4647
It("should successfully start and subscribe to libvirt events", func() {
4748
ctx := context.Background()
49+
50+
// Create a hypervisor resource for this test
51+
hypervisorName := "start-success-test-hypervisor"
52+
originalHostname := sys.Hostname
53+
sys.Hostname = hypervisorName
54+
defer func() {
55+
sys.Hostname = originalHostname
56+
}()
57+
58+
hypervisor := &kvmv1.Hypervisor{
59+
ObjectMeta: metav1.ObjectMeta{
60+
Name: hypervisorName,
61+
},
62+
}
63+
Expect(k8sClient.Create(ctx, hypervisor)).To(Succeed())
64+
defer func() {
65+
err := k8sClient.Delete(ctx, hypervisor)
66+
Expect(err).NotTo(HaveOccurred())
67+
}()
68+
4869
eventCallbackCalled := false
4970

5071
controllerReconciler := &HypervisorReconciler{
@@ -68,7 +89,27 @@ var _ = Describe("Hypervisor Controller", func() {
6889
})
6990

7091
It("should fail when libvirt connection fails", func() {
71-
ctx := context.Background()
92+
ctx, cancel := context.WithCancel(context.Background())
93+
defer cancel()
94+
95+
// Create a hypervisor resource for this test
96+
hypervisorName := "start-fail-test-hypervisor"
97+
originalHostname := sys.Hostname
98+
sys.Hostname = hypervisorName
99+
defer func() {
100+
sys.Hostname = originalHostname
101+
}()
102+
103+
hypervisor := &kvmv1.Hypervisor{
104+
ObjectMeta: metav1.ObjectMeta{
105+
Name: hypervisorName,
106+
},
107+
}
108+
Expect(k8sClient.Create(context.Background(), hypervisor)).To(Succeed())
109+
defer func() {
110+
err := k8sClient.Delete(context.Background(), hypervisor)
111+
Expect(err).NotTo(HaveOccurred())
112+
}()
72113

73114
controllerReconciler := &HypervisorReconciler{
74115
Client: k8sClient,
@@ -78,12 +119,124 @@ var _ = Describe("Hypervisor Controller", func() {
78119
return errors.New("connection failed")
79120
},
80121
},
122+
reconcileCh: make(chan event.GenericEvent, 1),
123+
libvirtConnectInterval: 10 * time.Millisecond,
124+
}
125+
126+
// Start runs in a goroutine so we can cancel the context
127+
done := make(chan error, 1)
128+
go func() {
129+
done <- controllerReconciler.Start(ctx)
130+
}()
131+
132+
// Wait for the hypervisor status to reflect the failed libvirt connection
133+
// This must happen BEFORE we cancel the context to ensure the Start method
134+
// had time to attempt connection and update the status
135+
var updatedHypervisor kvmv1.Hypervisor
136+
Eventually(func() bool {
137+
err := k8sClient.Get(context.Background(), types.NamespacedName{Name: hypervisorName}, &updatedHypervisor)
138+
if err != nil {
139+
return false
140+
}
141+
for _, condition := range updatedHypervisor.Status.Conditions {
142+
if condition.Type == "LibVirtConnection" {
143+
return condition.Status == metav1.ConditionFalse &&
144+
condition.Reason == "ConnectFailed" &&
145+
strings.Contains(condition.Message, "connection failed")
146+
}
147+
}
148+
return false
149+
}, 5*time.Second, 100*time.Millisecond).Should(BeTrue(), "hypervisor status should reflect failed libvirt connection")
150+
151+
// Cancel the context to stop the Start method
152+
cancel()
153+
154+
// Wait for Start to return with context cancellation error
155+
select {
156+
case err := <-done:
157+
Expect(err).To(HaveOccurred())
158+
Expect(err.Error()).To(ContainSubstring("context done while trying to connect to libvirt"))
159+
case <-time.After(2 * time.Second):
160+
Fail("timeout waiting for Start to return after context cancellation")
161+
}
162+
})
163+
164+
It("should fail when hypervisor resource does not exist", func() {
165+
ctx := context.Background()
166+
167+
// Set hostname to a non-existent hypervisor
168+
originalHostname := sys.Hostname
169+
sys.Hostname = "non-existent-hypervisor"
170+
defer func() {
171+
sys.Hostname = originalHostname
172+
}()
173+
174+
controllerReconciler := &HypervisorReconciler{
175+
Client: k8sClient,
176+
Scheme: k8sClient.Scheme(),
177+
Libvirt: &libvirt.InterfaceMock{
178+
ConnectFunc: func() error {
179+
return nil
180+
},
181+
},
81182
reconcileCh: make(chan event.GenericEvent, 1),
82183
}
83184

84185
err := controllerReconciler.Start(ctx)
85186
Expect(err).To(HaveOccurred())
86-
Expect(err.Error()).To(ContainSubstring("connection failed"))
187+
Expect(err.Error()).To(ContainSubstring("unable to get hypervisor"))
188+
})
189+
190+
It("should retry libvirt connection and succeed after initial failures", func() {
191+
ctx := context.Background()
192+
193+
// Create a hypervisor resource for this test
194+
hypervisorName := "start-retry-test-hypervisor"
195+
originalHostname := sys.Hostname
196+
sys.Hostname = hypervisorName
197+
defer func() {
198+
sys.Hostname = originalHostname
199+
}()
200+
201+
hypervisor := &kvmv1.Hypervisor{
202+
ObjectMeta: metav1.ObjectMeta{
203+
Name: hypervisorName,
204+
},
205+
}
206+
Expect(k8sClient.Create(ctx, hypervisor)).To(Succeed())
207+
defer func() {
208+
err := k8sClient.Delete(ctx, hypervisor)
209+
Expect(err).NotTo(HaveOccurred())
210+
}()
211+
212+
// Track connection attempts
213+
connectAttempts := 0
214+
eventCallbackCalled := false
215+
216+
controllerReconciler := &HypervisorReconciler{
217+
Client: k8sClient,
218+
Scheme: k8sClient.Scheme(),
219+
Libvirt: &libvirt.InterfaceMock{
220+
ConnectFunc: func() error {
221+
connectAttempts++
222+
// Fail first 2 attempts, succeed on 3rd
223+
if connectAttempts < 3 {
224+
return errors.New("connection failed")
225+
}
226+
return nil
227+
},
228+
WatchDomainChangesFunc: func(eventId golibvirt.DomainEventID, handlerId string, handler func(context.Context, any)) {
229+
eventCallbackCalled = true
230+
},
231+
},
232+
reconcileCh: make(chan event.GenericEvent, 1),
233+
libvirtConnectInterval: 10 * time.Millisecond, // Use short interval for fast test
234+
}
235+
236+
err := controllerReconciler.Start(ctx)
237+
Expect(err).NotTo(HaveOccurred())
238+
Expect(connectAttempts).To(Equal(3))
239+
Expect(eventCallbackCalled).To(BeTrue())
87240
})
88241
})
89242

0 commit comments

Comments
 (0)