Skip to content

Commit 0710a2e

Browse files
fix
Signed-off-by: Yaroslav Borbat <yaroslav.borbat@flant.com>
1 parent 520b685 commit 0710a2e

2 files changed

Lines changed: 153 additions & 0 deletions

File tree

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
diff --git a/pkg/virt-launcher/virtwrap/live-migration-source.go b/pkg/virt-launcher/virtwrap/live-migration-source.go
2+
index 066876f0d4..3b5357efc6 100644
3+
--- a/pkg/virt-launcher/virtwrap/live-migration-source.go
4+
+++ b/pkg/virt-launcher/virtwrap/live-migration-source.go
5+
@@ -22,6 +22,7 @@ package virtwrap
6+
import (
7+
"encoding/json"
8+
"encoding/xml"
9+
+ "errors"
10+
"fmt"
11+
"path/filepath"
12+
"strings"
13+
@@ -481,41 +482,49 @@ func (m *migrationMonitor) isMigrationProgressing() bool {
14+
func (m *migrationMonitor) determineNonRunningMigrationStatus(dom cli.VirDomain) *libvirt.DomainJobInfo {
15+
logger := log.Log.Object(m.vmi)
16+
// check if an ongoing migration has been completed before we could capture the outcome
17+
- if m.lastProgressUpdate > m.start {
18+
- logger.Info("Migration job has probably completed before we could capture the status. Getting latest status.")
19+
- // at this point the migration is over, but we don't know the result.
20+
- // check if we were trying to cancel this job. In this case, finalize the migration.
21+
- migration, _ := m.l.metadataCache.Migration.Load()
22+
- if migration.AbortStatus == string(v1.MigrationAbortInProgress) {
23+
- logger.Info("Migration job was canceled")
24+
- return &libvirt.DomainJobInfo{
25+
- Type: libvirt.DOMAIN_JOB_CANCELLED,
26+
- DataRemaining: uint64(m.remainingData),
27+
- DataRemainingSet: true,
28+
- }
29+
- }
30+
+ if !(m.lastProgressUpdate > m.start) {
31+
+ logger.Info("Migration job didn't start yet")
32+
+ return nil
33+
+ }
34+
35+
- // If the domain is active, it means that the migration has failed.
36+
- domainState, _, err := dom.GetState()
37+
- if err != nil {
38+
- logger.Reason(err).Error("failed to get domain state")
39+
- if libvirtError, ok := err.(libvirt.Error); ok &&
40+
- (libvirtError.Code == libvirt.ERR_NO_DOMAIN ||
41+
- libvirtError.Code == libvirt.ERR_OPERATION_INVALID) {
42+
- logger.Info("domain is not running on this node")
43+
- return nil
44+
- }
45+
+ logger.Info("Migration job has probably completed before we could capture the status. Getting latest status.")
46+
+ // at this point the migration is over, but we don't know the result.
47+
+ // check if we were trying to cancel this job. In this case, finalize the migration.
48+
+ migration, _ := m.l.metadataCache.Migration.Load()
49+
+ if migration.AbortStatus == string(v1.MigrationAbortInProgress) {
50+
+ logger.Info("Migration job was canceled")
51+
+ return &libvirt.DomainJobInfo{
52+
+ Type: libvirt.DOMAIN_JOB_CANCELLED,
53+
+ DataRemaining: m.remainingData,
54+
+ DataRemainingSet: true,
55+
}
56+
- if domainState == libvirt.DOMAIN_RUNNING {
57+
- logger.Info("Migration job failed")
58+
+ }
59+
+
60+
+ // If the domain is active, it means that the migration has failed.
61+
+ domainState, _, err := dom.GetState()
62+
+ if err != nil {
63+
+ var libvirtError libvirt.Error
64+
+ if errors.As(err, &libvirtError) &&
65+
+ (libvirtError.Is(libvirt.ERR_NO_DOMAIN) || libvirtError.Is(libvirt.ERR_OPERATION_INVALID)) {
66+
+ logger.Info("Domain is not running on this node")
67+
return &libvirt.DomainJobInfo{
68+
- Type: libvirt.DOMAIN_JOB_FAILED,
69+
- DataRemaining: uint64(m.remainingData),
70+
+ Type: libvirt.DOMAIN_JOB_COMPLETED,
71+
+ DataRemaining: m.remainingData,
72+
DataRemainingSet: true,
73+
}
74+
}
75+
+ logger.Reason(err).Error("Failed to get domain state")
76+
+ return nil
77+
+ }
78+
+ if domainState == libvirt.DOMAIN_RUNNING {
79+
+ logger.Info("Migration job failed")
80+
+ return &libvirt.DomainJobInfo{
81+
+ Type: libvirt.DOMAIN_JOB_FAILED,
82+
+ DataRemaining: m.remainingData,
83+
+ DataRemainingSet: true,
84+
+ }
85+
}
86+
- logger.Info("Migration job didn't start yet")
87+
+
88+
return nil
89+
}
90+
91+
@@ -642,11 +651,10 @@ func (m *migrationMonitor) startMonitor() {
92+
93+
stats := completedJobInfo
94+
if stats == nil {
95+
- stats, err = dom.GetJobStats(0)
96+
- if err != nil {
97+
- logger.Reason(err).Error("failed to get domain job info")
98+
- continue
99+
- }
100+
+ stats = m.getJobStats(dom, logger)
101+
+ }
102+
+ if stats == nil {
103+
+ continue
104+
}
105+
106+
if stats.DataRemainingSet {
107+
@@ -683,6 +691,27 @@ func (m *migrationMonitor) startMonitor() {
108+
}
109+
}
110+
111+
+func (m *migrationMonitor) getJobStats(dom cli.VirDomain, logger *log.FilteredLogger) *libvirt.DomainJobInfo {
112+
+ jobStats, err := dom.GetJobStats(0)
113+
+ if err == nil {
114+
+ return jobStats
115+
+ }
116+
+ var libvirtError libvirt.Error
117+
+ if errors.As(err, &libvirtError) {
118+
+ if libvirtError.Is(libvirt.ERR_NO_DOMAIN) {
119+
+ jobStats = m.determineNonRunningMigrationStatus(dom)
120+
+ return jobStats
121+
+ }
122+
+ if libvirtError.Is(libvirt.ERR_OPERATION_INVALID) {
123+
+ logger.V(6).Reason(err).Info("Failed to get domain job info, will retry")
124+
+ return nil
125+
+ }
126+
+ }
127+
+ logger.Reason(err).Warning("Failed to get domain job info, will retry")
128+
+
129+
+ return nil
130+
+}
131+
+
132+
// logMigrationInfo logs the same migration info as `virsh -r domjobinfo`
133+
func logMigrationInfo(logger *log.FilteredLogger, uid string, info *libvirt.DomainJobInfo) {
134+
bToMiB := func(bytes uint64) uint64 {

images/virt-artifact/patches/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,3 +347,22 @@ AllowAutoConverge is a global configuration option or should be changed in clust
347347
This patch disables setting migration configuration from MigrationPolicies resources,
348348
and forces source virt-handler to wait for migrationConfiguration in KVVMI status before
349349
starting migration.
350+
351+
#### `052-fix-migration-monitor-after-completion.patch`
352+
353+
##### Problem
354+
355+
The migration monitor (`migrationMonitor`) keeps calling `GetJobStats()` even after a migration has completed successfully.
356+
This leads to errors like:
357+
358+
```
359+
"Failed to get domain job info": virError(Code=42, Domain=10, Message='Domain not found...')
360+
```
361+
362+
These errors occur because the domain has already been successfully migrated and cleaned up on the source node.
363+
364+
Additionally, harmless but noisy errors are logged during migration, such as:
365+
366+
```
367+
"Requested operation is not valid: domain is not running"
368+
```

0 commit comments

Comments
 (0)