Skip to content

Commit b852382

Browse files
roachprod: improve snapshot support for AWS and GCE (#170519)
roachprod: improve snapshot support for AWS and GCE
2 parents 4e7fa94 + 3066efc commit b852382

5 files changed

Lines changed: 108 additions & 29 deletions

File tree

pkg/cmd/roachprod/cli/commands.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2130,7 +2130,7 @@ func buildSnapshotListCmd() *cobra.Command {
21302130
return err
21312131
}
21322132
for _, snapshot := range snapshots {
2133-
config.Logger.Printf("found snapshot %s (id: %s)", snapshot.Name, snapshot.ID)
2133+
config.Logger.Printf("found snapshot %s (id: %s, status: %s)", snapshot.Name, snapshot.ID, snapshot.Status)
21342134
}
21352135
return nil
21362136
}),

pkg/roachprod/roachprod.go

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package roachprod
77

88
import (
99
"bytes"
10+
"cmp"
1011
"context"
1112
"encoding/json"
1213
"fmt"
@@ -26,6 +27,7 @@ import (
2627
"strconv"
2728
"strings"
2829
"sync"
30+
"sync/atomic"
2931
"time"
3032

3133
"github.com/cockroachdb/cockroach/pkg/build"
@@ -2415,14 +2417,10 @@ func CreateSnapshot(
24152417
// probably the predecessor one. Also ensure that any running CRDB processes
24162418
// have been stopped since we're taking raw disk snapshots cluster-wide.
24172419

2418-
// Count total volumes across all nodes for the progress spinner.
2419-
totalVolumes := 0
2420-
for _, node := range nodes {
2421-
totalVolumes += len(c.VMs[node-1].NonBootAttachedVolumes)
2422-
}
2423-
spinner := ui.NewDefaultCountingSpinner(l, "creating snapshots", totalVolumes)
2420+
spinner := ui.NewDefaultCountingSpinner(l, "creating snapshots", len(nodes))
24242421
defer spinner.Start()()
24252422

2423+
var nodesSnapped atomic.Int32
24262424
volumesSnapshotMu := struct {
24272425
syncutil.Mutex
24282426
snapshots []vm.VolumeSnapshot
@@ -2488,14 +2486,14 @@ func CreateSnapshot(
24882486
}
24892487
volumesSnapshotMu.Lock()
24902488
volumesSnapshotMu.snapshots = append(volumesSnapshotMu.snapshots, volumeSnapshot)
2491-
spinner.CountStatus(len(volumesSnapshotMu.snapshots))
24922489
volumesSnapshotMu.Unlock()
24932490
return nil
24942491
})
24952492
}
24962493
if err := g.Wait(); err != nil {
24972494
return err
24982495
}
2496+
spinner.CountStatus(int(nodesSnapped.Add(1)))
24992497
return nil
25002498
}); err != nil {
25012499
res.Err = err
@@ -2649,6 +2647,15 @@ func ApplySnapshots(
26492647
return err
26502648
}
26512649

2650+
// Sort by storeIdx so volumes are attached in order.
2651+
// AttachVolume assigns device names sequentially
2652+
// (/dev/sdd, /dev/sde, ...) based on attachment order,
2653+
// so attaching in storeIdx order ensures the device-to-
2654+
// store mapping is consistent with the original cluster.
2655+
slices.SortFunc(createdVolumes, func(a, b createdVolume) int {
2656+
return cmp.Compare(a.storeIdx, b.storeIdx)
2657+
})
2658+
26522659
// Attach and mount volumes sequentially since GCE
26532660
// does not support concurrent instance modifications.
26542661
for _, cv := range createdVolumes {
@@ -2665,7 +2672,7 @@ func ApplySnapshots(
26652672
l.Printf(buf.String())
26662673
return err
26672674
}
2668-
l.Printf("mounted %s at %s on %s", cv.volume.ProviderResourceID, mountDir, cVM.ProviderID)
2675+
l.Printf("mounted %s at %s on %s", cv.volume.ProviderResourceID, mountDir, cVM.Name)
26692676
}
26702677
// Save the cluster cache once after all volumes are
26712678
// attached and mounted for this node.

pkg/roachprod/vm/aws/aws.go

Lines changed: 70 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2658,6 +2658,7 @@ func (p *Provider) CreateVolume(
26582658
args = append(args, "--volume-type", vco.Type)
26592659
case "":
26602660
// Use the default.
2661+
args = append(args, "--volume-type", defaultEBSVolumeType)
26612662
default:
26622663
return vol, errors.Newf("Invalid volume type %q", vco.Type)
26632664
}
@@ -2754,8 +2755,73 @@ func (p *Provider) DeleteVolume(l *logger.Logger, volume vm.Volume, v *vm.VM) er
27542755
return nil
27552756
}
27562757

2757-
func (p *Provider) ListVolumes(l *logger.Logger, vm *vm.VM) ([]vm.Volume, error) {
2758-
return vm.NonBootAttachedVolumes, nil
2758+
func (p *Provider) ListVolumes(l *logger.Logger, v *vm.VM) ([]vm.Volume, error) {
2759+
if v.ProviderID == "" {
2760+
return nil, nil
2761+
}
2762+
region := v.Zone[:len(v.Zone)-1]
2763+
2764+
// Describe this instance to get block device mappings and the root device
2765+
// name, which we need to identify (and exclude) the boot volume.
2766+
var descResp DescribeInstancesOutput
2767+
descArgs := []string{
2768+
"ec2", "describe-instances",
2769+
"--region", region,
2770+
"--instance-ids", v.ProviderID,
2771+
}
2772+
if err := p.runJSONCommand(l, descArgs, &descResp); err != nil {
2773+
return nil, err
2774+
}
2775+
var instance *DescribeInstancesOutputInstance
2776+
for _, res := range descResp.Reservations {
2777+
for i := range res.Instances {
2778+
if res.Instances[i].InstanceID == v.ProviderID {
2779+
instance = &res.Instances[i]
2780+
break
2781+
}
2782+
}
2783+
}
2784+
if instance == nil {
2785+
l.Printf("WARNING: instance %s not found in describe-instances response for region %s", v.ProviderID, region)
2786+
return nil, nil
2787+
}
2788+
2789+
// Collect non-boot volume device names from the block device
2790+
// mappings. The root device is excluded so we only return data volumes.
2791+
deviceByVolumeID := make(map[string]string)
2792+
for _, bdm := range instance.BlockDeviceMappings {
2793+
if bdm.DeviceName != instance.RootDeviceName {
2794+
deviceByVolumeID[bdm.Disk.VolumeID] = bdm.DeviceName
2795+
}
2796+
}
2797+
if len(deviceByVolumeID) == 0 {
2798+
return nil, nil
2799+
}
2800+
2801+
// Describe the non-boot volumes to get their full metadata.
2802+
volsByInstance, err := p.getVolumesForInstances(
2803+
context.Background(), l, region, []string{v.ProviderID},
2804+
)
2805+
if err != nil {
2806+
return nil, err
2807+
}
2808+
2809+
var volumes []vm.Volume
2810+
for _, vol := range volsByInstance[v.ProviderID] {
2811+
if _, ok := deviceByVolumeID[vol.ProviderResourceID]; ok {
2812+
volumes = append(volumes, vol)
2813+
}
2814+
}
2815+
2816+
// Sort by device name to ensure deterministic ordering that matches
2817+
// the mount point assignment (/mnt/data1, /mnt/data2, etc.).
2818+
slices.SortFunc(volumes, func(a, b vm.Volume) int {
2819+
return strings.Compare(
2820+
deviceByVolumeID[a.ProviderResourceID],
2821+
deviceByVolumeID[b.ProviderResourceID],
2822+
)
2823+
})
2824+
return volumes, nil
27592825
}
27602826

27612827
type snapshotOutput struct {
@@ -2797,21 +2863,11 @@ func (p *Provider) CreateVolumeSnapshot(
27972863
return vm.VolumeSnapshot{}, err
27982864
}
27992865

2800-
// Wait for the snapshot to complete before returning. AWS snapshots
2801-
// are asynchronous and cannot be used to create volumes while pending.
2802-
waitArgs := []string{
2803-
"ec2", "wait", "snapshot-completed",
2804-
"--region", region,
2805-
"--snapshot-ids", so.SnapshotID,
2806-
}
2807-
if _, err := p.runCommand(l, waitArgs); err != nil {
2808-
return vm.VolumeSnapshot{}, errors.Wrapf(err, "waiting for snapshot %s to complete", so.SnapshotID)
2809-
}
2810-
28112866
return vm.VolumeSnapshot{
28122867
ID: so.SnapshotID,
28132868
Name: vsco.Name,
28142869
Region: region,
2870+
Status: so.State,
28152871
}, nil
28162872
}
28172873

@@ -2875,6 +2931,7 @@ func (p *Provider) ListVolumeSnapshots(
28752931
ID: so.SnapshotID,
28762932
Name: name,
28772933
Region: r,
2934+
Status: so.State,
28782935
})
28792936
}
28802937
mu.Lock()

pkg/roachprod/vm/gce/gcloud.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -743,8 +743,9 @@ func (p *Provider) CreateVolumeSnapshot(
743743
return vm.VolumeSnapshot{}, err
744744
}
745745
return vm.VolumeSnapshot{
746-
ID: createJsonResponse.ID,
747-
Name: createJsonResponse.Name,
746+
ID: createJsonResponse.ID,
747+
Name: createJsonResponse.Name,
748+
Status: createJsonResponse.Status,
748749
}, nil
749750
}
750751

@@ -756,7 +757,7 @@ func (p *Provider) ListVolumeSnapshots(
756757
"--project", p.GetProject(),
757758
"snapshots",
758759
"list",
759-
"--format", "json(name,id)",
760+
"--format", "json(name,id,status)",
760761
}
761762
var filters []string
762763
// Only list snapshots that are fully created. Without this filter,
@@ -787,8 +788,9 @@ func (p *Provider) ListVolumeSnapshots(
787788
continue
788789
}
789790
snapshots = append(snapshots, vm.VolumeSnapshot{
790-
ID: snapshotJson.ID,
791-
Name: snapshotJson.Name,
791+
ID: snapshotJson.ID,
792+
Name: snapshotJson.Name,
793+
Status: snapshotJson.Status,
792794
})
793795
}
794796
sort.Sort(vm.VolumeSnapshots(snapshots))

pkg/roachprod/vm/vm.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,17 @@ type VolumeSnapshot struct {
456456
// Region is set and used by the AWS provider to scope snapshot
457457
// operations to the correct region. Other providers may leave it empty.
458458
Region string
459+
// Status is the current state of the snapshot. The value is
460+
// provider-specific (e.g. "completed" on AWS, "READY" on GCE).
461+
// An empty string means the state is unknown.
462+
Status string
463+
}
464+
465+
// IsReady returns true when the snapshot has finished being created and
466+
// is usable. See VolumeSnapshot.Status for provider-specific values.
467+
func (v VolumeSnapshot) IsReady() bool {
468+
return v.Status == "READY" || // GCE
469+
v.Status == "completed" // AWS
459470
}
460471

461472
type VolumeSnapshots []VolumeSnapshot
@@ -616,8 +627,10 @@ type Provider interface {
616627
DeleteVolume(l *logger.Logger, volume Volume, vm *VM) error
617628
// AttachVolume attaches the given volume to the given VM.
618629
AttachVolume(l *logger.Logger, volume Volume, vm *VM) (string, error)
619-
// CreateVolumeSnapshot creates a snapshot of the given volume, using the
620-
// given options.
630+
// CreateVolumeSnapshot creates a snapshot of the given volume. Some
631+
// providers may return before the snapshot is fully ready. Callers
632+
// that need a completed snapshot should poll ListVolumeSnapshots
633+
// and check the Status field.
621634
CreateVolumeSnapshot(l *logger.Logger, volume Volume, vsco VolumeSnapshotCreateOpts) (VolumeSnapshot, error)
622635
// ListVolumeSnapshots lists the individual volume snapshots that satisfy
623636
// the search criteria.

0 commit comments

Comments
 (0)