Skip to content

Commit d0743f6

Browse files
committed
kola: Add soft-reboot support for external tests
Implements soft-reboot support for Kola, enabling tests to use systemd's soft-reboot functionality. The implementation follows the same pattern as regular reboots, but uses `systemctl soft-reboot` and tracks systemd boot timestamps rather than kernel boot IDs for state detection.
1 parent 083029f commit d0743f6

16 files changed

Lines changed: 349 additions & 19 deletions

File tree

docs/kola/external-tests.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,36 @@ echo "ok autopkgtest rebooting"
102102

103103
This will trigger the monitoring `kola` process to invoke a reboot.
104104

105+
## Support for soft-rebooting
106+
107+
Kola also supports soft-rebooting using systemd's `systemctl soft-reboot` command.
108+
Soft-reboot restarts the userspace while keeping the kernel and hardware state intact.
109+
This is useful for testing userspace updates without a full system reboot.
110+
111+
The soft-reboot API is similar to the regular reboot API:
112+
113+
```
114+
#!/bin/bash
115+
# Example of soft-reboot test
116+
set -xeuo pipefail
117+
case "${AUTOPKGTEST_REBOOT_MARK:-}" in
118+
"") echo "test beginning"; /tmp/autopkgtest-soft-reboot mark1 ;;
119+
mark1) echo "test in mark1"; /tmp/autopkgtest-soft-reboot mark2 ;;
120+
mark2) echo "test in mark2" ;;
121+
*) echo "unexpected mark: ${AUTOPKGTEST_REBOOT_MARK}"; exit 1;;
122+
esac
123+
echo "ok autopkgtest soft-rebooting"
124+
```
125+
126+
Key differences between a soft-reboot and a regular reboot:
127+
- The kernel boot ID (`/proc/sys/kernel/random/boot_id`) remains the same
128+
- Hardware state and kernel memory are preserved
129+
- Only userspace is restarted
130+
- Uses `systemctl soft-reboot` instead of `reboot`
131+
132+
Both `/tmp/autopkgtest-soft-reboot` and `/tmp/autopkgtest-soft-reboot-prepare` scripts are available,
133+
analogous to their regular reboot counterparts.
134+
105135
The rationale for requesting reboots through kola (rather than rebooting directly) is that it helps kola to know when a reboot is happening
106136
so that it can correctly follow the state of the systemd journal, etc. A future
107137
enhancement will support directly invoking `reboot` and having kola just figure

mantle/cmd/kola/devshell.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,8 @@ func runDevShellSSH(ctx context.Context, builder *platform.QemuBuilder, conf *co
239239
_ = inst.Kill()
240240
case guestStateInReboot:
241241
statusMsg = "QEMU guest initiated reboot"
242+
case guestStateInSoftReboot:
243+
statusMsg = "QEMU guest initiated soft-reboot"
242244
case guestStateOpenSshStopped:
243245
statusMsg = "QEMU openssh is not listening"
244246
case guestStateSshDisconnected:
@@ -285,6 +287,8 @@ const (
285287
guestStateInShutdown
286288
// guestStateInReboot indicates that the guest has started a reboot
287289
guestStateInReboot
290+
// guestStateInSoftReboot indicates that the guest has started a soft-reboot
291+
guestStateInSoftReboot
288292
// guestStateHalted indicates that the guest has halted or shutdown
289293
guestStateHalted
290294
// guestStateBooting indicates that the instance is in early boot
@@ -325,6 +329,9 @@ func checkWriteState(msg string, c chan<- guestState) {
325329
if strings.Contains(msg, "Starting Reboot...") {
326330
c <- guestStateInReboot
327331
}
332+
if strings.Contains(msg, "Reached target soft-reboot") {
333+
c <- guestStateInSoftReboot
334+
}
328335
}
329336

330337
type systemdEventMessage struct {
@@ -428,6 +435,11 @@ func watchJournal(builder *platform.QemuBuilder, conf *conf.Conf, stateChan chan
428435
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
429436
guestState: guestStateInShutdown,
430437
},
438+
{
439+
unit: "systemd-soft-reboot.service",
440+
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
441+
guestState: guestStateInSoftReboot,
442+
},
431443
}
432444

433445
r, err := builder.VirtioJournal(conf, "-o json --system")

mantle/cmd/kolet/kolet.go

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,25 @@ reboot
105105
autopkgtestRebootPrepareScript = `#!/bin/bash
106106
set -euo pipefail
107107
exec /usr/local/bin/kolet reboot-request "$1"
108+
`
109+
110+
// Soft-reboot support
111+
autopkgTestSoftRebootPath = "/tmp/autopkgtest-soft-reboot"
112+
autopkgtestSoftRebootScript = `#!/bin/bash
113+
set -xeuo pipefail
114+
/usr/local/bin/kolet soft-reboot-request "$1"
115+
systemctl soft-reboot
116+
`
117+
autopkgTestSoftRebootPreparePath = "/tmp/autopkgtest-soft-reboot-prepare"
118+
119+
autopkgtestSoftRebootPrepareScript = `#!/bin/bash
120+
set -euo pipefail
121+
exec /usr/local/bin/kolet soft-reboot-request "$1"
108122
`
109123

110124
// File used to communicate between the script and the kolet runner internally
111-
rebootRequestFifo = "/run/kolet-reboot"
125+
rebootRequestFifo = "/run/kolet-reboot"
126+
softRebootRequestFifo = "/run/kolet-soft-reboot"
112127
)
113128

114129
var (
@@ -140,6 +155,13 @@ var (
140155
SilenceUsage: true,
141156
}
142157

158+
cmdSoftReboot = &cobra.Command{
159+
Use: "soft-reboot-request MARK",
160+
Short: "Request a soft reboot",
161+
RunE: runSoftReboot,
162+
SilenceUsage: true,
163+
}
164+
143165
cmdHttpd = &cobra.Command{
144166
Use: "httpd",
145167
Short: "Start an HTTP server to serve the contents of the file system",
@@ -259,6 +281,20 @@ func initiateReboot(mark string) error {
259281
return nil
260282
}
261283

284+
func initiateSoftReboot(mark string) error {
285+
systemdjournal.Print(systemdjournal.PriInfo, "Processing soft-reboot request")
286+
res := kola.KoletResult{
287+
SoftReboot: string(mark),
288+
}
289+
buf, err := json.Marshal(&res)
290+
if err != nil {
291+
return errors.Wrapf(err, "serializing KoletResult")
292+
}
293+
fmt.Println(string(buf))
294+
systemdjournal.Print(systemdjournal.PriInfo, "Acknowledged soft-reboot request with mark: %s", buf)
295+
return nil
296+
}
297+
262298
func runExtUnit(cmd *cobra.Command, args []string) error {
263299
rebootOff, _ := cmd.Flags().GetBool("deny-reboots")
264300
// Write the autopkgtest wrappers
@@ -268,10 +304,18 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
268304
if err := os.WriteFile(autopkgTestRebootPreparePath, []byte(autopkgtestRebootPrepareScript), 0755); err != nil {
269305
return err
270306
}
307+
// Write the soft-reboot autopkgtest wrappers
308+
if err := os.WriteFile(autopkgTestSoftRebootPath, []byte(autopkgtestSoftRebootScript), 0755); err != nil {
309+
return err
310+
}
311+
if err := os.WriteFile(autopkgTestSoftRebootPreparePath, []byte(autopkgtestSoftRebootPrepareScript), 0755); err != nil {
312+
return err
313+
}
271314

272315
// Create the reboot cmdline -> login FIFO for the reboot mark and
273316
// proxy it into a channel
274317
rebootChan := make(chan string)
318+
softRebootChan := make(chan string)
275319
errChan := make(chan error)
276320

277321
// We want to prevent certain tests (like non-exclusive tests) from rebooting
@@ -293,6 +337,25 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
293337
}
294338
rebootChan <- string(buf)
295339
}()
340+
341+
// Create soft-reboot FIFO and channel
342+
err = exec.Command("mkfifo", softRebootRequestFifo).Run()
343+
if err != nil {
344+
return err
345+
}
346+
go func() {
347+
softRebootReader, err := os.Open(softRebootRequestFifo)
348+
if err != nil {
349+
errChan <- err
350+
return
351+
}
352+
defer softRebootReader.Close()
353+
buf, err := io.ReadAll(softRebootReader)
354+
if err != nil {
355+
errChan <- err
356+
}
357+
softRebootChan <- string(buf)
358+
}()
296359
}
297360

298361
ctx := context.Background()
@@ -334,6 +397,8 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
334397
return err
335398
case reboot := <-rebootChan:
336399
return initiateReboot(reboot)
400+
case softReboot := <-softRebootChan:
401+
return initiateSoftReboot(softReboot)
337402
case m := <-unitevents:
338403
for n := range m {
339404
if n == unitname {
@@ -387,6 +452,35 @@ func runReboot(cmd *cobra.Command, args []string) error {
387452
return nil
388453
}
389454

455+
// runSoftReboot handles soft-reboot requests similar to runReboot but for systemctl soft-reboot
456+
func runSoftReboot(cmd *cobra.Command, args []string) error {
457+
if _, err := os.Stat(softRebootRequestFifo); os.IsNotExist(err) {
458+
return errors.New("Soft-reboots are not supported for this test, softRebootRequestFifo does not exist.")
459+
}
460+
461+
mark := args[0]
462+
systemdjournal.Print(systemdjournal.PriInfo, "Requesting soft-reboot with mark: %s", mark)
463+
err := exec.Command("mkfifo", kola.KoletRebootAckFifo).Run()
464+
if err != nil {
465+
return err
466+
}
467+
err = os.WriteFile(softRebootRequestFifo, []byte(mark), 0644)
468+
if err != nil {
469+
return err
470+
}
471+
f, err := os.Open(kola.KoletRebootAckFifo)
472+
if err != nil {
473+
return err
474+
}
475+
buf := make([]byte, 1)
476+
_, err = f.Read(buf)
477+
if err != nil {
478+
return err
479+
}
480+
systemdjournal.Print(systemdjournal.PriInfo, "Soft-reboot request acknowledged")
481+
return nil
482+
}
483+
390484
func runHttpd(cmd *cobra.Command, args []string) error {
391485
port, _ := cmd.Flags().GetString("port")
392486
path, _ := cmd.Flags().GetString("path")
@@ -403,6 +497,8 @@ func main() {
403497
root.AddCommand(cmdRunExtUnit)
404498
cmdReboot.Args = cobra.ExactArgs(1)
405499
root.AddCommand(cmdReboot)
500+
cmdSoftReboot.Args = cobra.ExactArgs(1)
501+
root.AddCommand(cmdSoftReboot)
406502
cmdHttpd.Flags().StringP("port", "", "80", "port")
407503
cmdHttpd.Flags().StringP("path", "", "./", "path to filesystem contents to serve")
408504
cmdHttpd.Args = cobra.ExactArgs(0)

mantle/kola/harness.go

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ const (
255255

256256
// KoletResult is serialized JSON passed from kolet to the harness
257257
type KoletResult struct {
258-
Reboot string
258+
Reboot string
259+
SoftReboot string
259260
}
260261

261262
const KoletExtTestUnit = "kola-runext"
@@ -1106,6 +1107,10 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
11061107
if err != nil {
11071108
return errors.Wrapf(err, "getting boot id")
11081109
}
1110+
userspaceTimestamp, err := platform.GetMachineBootCount(mach)
1111+
if err != nil {
1112+
return errors.Wrapf(err, "getting userspace timestamp")
1113+
}
11091114
plog.Debug("Starting kolet run-test-unit")
11101115
if previousRebootState != "" {
11111116
// quote around the value for systemd
@@ -1138,27 +1143,47 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
11381143
return errors.Wrapf(err, "parsing kolet json %s", string(stdout))
11391144
}
11401145
}
1141-
// If no reboot is requested, we're done
1142-
if koletRes.Reboot == "" {
1146+
// If no reboot or soft-reboot is requested, we're done
1147+
if koletRes.Reboot == "" && koletRes.SoftReboot == "" {
11431148
return nil
11441149
}
11451150

1146-
// A reboot is requested
1147-
previousRebootState = koletRes.Reboot
1148-
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
1149-
// This signals to the subject that we have saved the mark, and the subject
1150-
// can proceed with rebooting. We stop sshd to ensure that the wait below
1151-
// doesn't log in while ssh is shutting down.
1152-
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
1153-
if err != nil {
1154-
return errors.Wrapf(err, "failed to acknowledge reboot")
1151+
// Handle regular reboot
1152+
if koletRes.Reboot != "" {
1153+
previousRebootState = koletRes.Reboot
1154+
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
1155+
// This signals to the subject that we have saved the mark, and the subject
1156+
// can proceed with rebooting. We stop sshd to ensure that the wait below
1157+
// doesn't log in while ssh is shutting down.
1158+
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
1159+
if err != nil {
1160+
return errors.Wrapf(err, "failed to acknowledge reboot")
1161+
}
1162+
plog.Debug("Waiting for reboot")
1163+
err = mach.WaitForReboot(120*time.Second, bootID)
1164+
if err != nil {
1165+
return errors.Wrapf(err, "Waiting for reboot")
1166+
}
1167+
plog.Debug("Reboot complete")
11551168
}
1156-
plog.Debug("Waiting for reboot")
1157-
err = mach.WaitForReboot(120*time.Second, bootID)
1158-
if err != nil {
1159-
return errors.Wrapf(err, "Waiting for reboot")
1169+
1170+
// Handle soft-reboot
1171+
if koletRes.SoftReboot != "" {
1172+
previousRebootState = koletRes.SoftReboot
1173+
plog.Debugf("Soft-reboot request with mark='%s'", previousRebootState)
1174+
// Use the userspace timestamp we collected at the beginning of this loop iteration
1175+
// Acknowledge the soft-reboot request
1176+
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'echo > %s'", KoletRebootAckFifo))
1177+
if err != nil {
1178+
return errors.Wrapf(err, "failed to acknowledge soft-reboot")
1179+
}
1180+
plog.Debug("Waiting for soft-reboot")
1181+
err = mach.WaitForSoftReboot(120*time.Second, userspaceTimestamp)
1182+
if err != nil {
1183+
return errors.Wrapf(err, "Waiting for soft-reboot")
1184+
}
1185+
plog.Debug("Soft-reboot complete")
11601186
}
1161-
plog.Debug("Reboot complete")
11621187
}
11631188
}
11641189

mantle/platform/machine/aws/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
8080
return platform.WaitForMachineReboot(am, am.journal, timeout, oldBootId)
8181
}
8282

83+
// WaitForSoftReboot forwards to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldUserspaceTimestamp, and
// returns that call's error unchanged.
// NOTE(review): presumably blocks until the userspace timestamp differs from
// oldUserspaceTimestamp or the timeout expires — confirm against the helper.
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
84+
return platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldUserspaceTimestamp)
85+
}
86+
8387
func (am *machine) Destroy() {
8488
origConsole, err := am.cluster.flight.api.GetConsoleOutput(am.ID())
8589
if err != nil {

mantle/platform/machine/azure/machine.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,15 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
108108
return am.refetchIPs()
109109
}
110110

111+
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
112+
err := platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldUserspaceTimestamp)
113+
if err != nil {
114+
return err
115+
}
116+
// For soft-reboot, IP addresses should not change, but let's refetch to be safe
117+
return am.refetchIPs()
118+
}
119+
111120
func (am *machine) Destroy() {
112121
if err := am.saveConsole(); err != nil {
113122
// log error, but do not fail to terminate instance

mantle/platform/machine/do/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ func (dm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7777
return platform.WaitForMachineReboot(dm, dm.journal, timeout, oldBootId)
7878
}
7979

80+
// WaitForSoftReboot forwards to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldUserspaceTimestamp, and
// returns that call's error unchanged.
// NOTE(review): presumably blocks until the userspace timestamp differs from
// oldUserspaceTimestamp or the timeout expires — confirm against the helper.
func (dm *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
81+
return platform.WaitForMachineSoftReboot(dm, dm.journal, timeout, oldUserspaceTimestamp)
82+
}
83+
8084
func (dm *machine) Destroy() {
8185
if err := dm.cluster.flight.api.DeleteDroplet(context.TODO(), dm.droplet.ID); err != nil {
8286
plog.Errorf("Error deleting droplet %v: %v", dm.droplet.ID, err)

mantle/platform/machine/esx/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ func (em *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7878
return platform.WaitForMachineReboot(em, em.journal, timeout, oldBootId)
7979
}
8080

81+
// WaitForSoftReboot forwards to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldUserspaceTimestamp, and
// returns that call's error unchanged.
// NOTE(review): presumably blocks until the userspace timestamp differs from
// oldUserspaceTimestamp or the timeout expires — confirm against the helper.
func (em *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
82+
return platform.WaitForMachineSoftReboot(em, em.journal, timeout, oldUserspaceTimestamp)
83+
}
84+
8185
func (em *machine) Destroy() {
8286
if err := em.cluster.flight.api.TerminateDevice(em.ID()); err != nil {
8387
plog.Errorf("Error terminating device %v: %v", em.ID(), err)

mantle/platform/machine/gcloud/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ func (gm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7878
return platform.WaitForMachineReboot(gm, gm.journal, timeout, oldBootId)
7979
}
8080

81+
// WaitForSoftReboot forwards to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldUserspaceTimestamp, and
// returns that call's error unchanged.
// NOTE(review): presumably blocks until the userspace timestamp differs from
// oldUserspaceTimestamp or the timeout expires — confirm against the helper.
func (gm *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
82+
return platform.WaitForMachineSoftReboot(gm, gm.journal, timeout, oldUserspaceTimestamp)
83+
}
84+
8185
func (gm *machine) Destroy() {
8286
if err := gm.saveConsole(); err != nil {
8387
plog.Errorf("Error saving console for instance %v: %v", gm.ID(), err)

mantle/platform/machine/openstack/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ func (om *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
9999
return platform.WaitForMachineReboot(om, om.journal, timeout, oldBootId)
100100
}
101101

102+
// WaitForSoftReboot forwards to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldUserspaceTimestamp, and
// returns that call's error unchanged.
// NOTE(review): presumably blocks until the userspace timestamp differs from
// oldUserspaceTimestamp or the timeout expires — confirm against the helper.
func (om *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
103+
return platform.WaitForMachineSoftReboot(om, om.journal, timeout, oldUserspaceTimestamp)
104+
}
105+
102106
func (om *machine) Destroy() {
103107
if err := om.saveConsole(); err != nil {
104108
plog.Errorf("Error saving console for instance %v: %v", om.ID(), err)

0 commit comments

Comments
 (0)