Skip to content

Commit 3066efc

Browse files
roachprod: improve snapshot support for AWS and GCE
- AWS ListVolumes now queries the EC2 API instead of using cached VM metadata, so volumes are discovered accurately after attach/detach. - CreateVolumeSnapshot no longer blocks waiting for the snapshot to complete on AWS, since the cluster is stopped and the point-in-time capture is immediate. - Add VolumeSnapshot.Status field and IsReady() helper so callers can check snapshot readiness across providers. - Spinner tracks per-node progress instead of per-volume. - Sort volumes by storeIdx before attaching to preserve device ordering. Epic: none Release note: None Co-Authored-By: roachdev-claude <roachdev-claude-bot@cockroachlabs.com>
1 parent 542a5b0 commit 3066efc

5 files changed

Lines changed: 108 additions & 29 deletions

File tree

pkg/cmd/roachprod/cli/commands.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2130,7 +2130,7 @@ func buildSnapshotListCmd() *cobra.Command {
21302130
return err
21312131
}
21322132
for _, snapshot := range snapshots {
2133-
config.Logger.Printf("found snapshot %s (id: %s)", snapshot.Name, snapshot.ID)
2133+
config.Logger.Printf("found snapshot %s (id: %s, status: %s)", snapshot.Name, snapshot.ID, snapshot.Status)
21342134
}
21352135
return nil
21362136
}),

pkg/roachprod/roachprod.go

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package roachprod
77

88
import (
99
"bytes"
10+
"cmp"
1011
"context"
1112
"encoding/json"
1213
"fmt"
@@ -26,6 +27,7 @@ import (
2627
"strconv"
2728
"strings"
2829
"sync"
30+
"sync/atomic"
2931
"time"
3032

3133
"github.com/cockroachdb/cockroach/pkg/build"
@@ -2412,14 +2414,10 @@ func CreateSnapshot(
24122414
// probably the predecessor one. Also ensure that any running CRDB processes
24132415
// have been stopped since we're taking raw disk snapshots cluster-wide.
24142416

2415-
// Count total volumes across all nodes for the progress spinner.
2416-
totalVolumes := 0
2417-
for _, node := range nodes {
2418-
totalVolumes += len(c.VMs[node-1].NonBootAttachedVolumes)
2419-
}
2420-
spinner := ui.NewDefaultCountingSpinner(l, "creating snapshots", totalVolumes)
2417+
spinner := ui.NewDefaultCountingSpinner(l, "creating snapshots", len(nodes))
24212418
defer spinner.Start()()
24222419

2420+
var nodesSnapped atomic.Int32
24232421
volumesSnapshotMu := struct {
24242422
syncutil.Mutex
24252423
snapshots []vm.VolumeSnapshot
@@ -2485,14 +2483,14 @@ func CreateSnapshot(
24852483
}
24862484
volumesSnapshotMu.Lock()
24872485
volumesSnapshotMu.snapshots = append(volumesSnapshotMu.snapshots, volumeSnapshot)
2488-
spinner.CountStatus(len(volumesSnapshotMu.snapshots))
24892486
volumesSnapshotMu.Unlock()
24902487
return nil
24912488
})
24922489
}
24932490
if err := g.Wait(); err != nil {
24942491
return err
24952492
}
2493+
spinner.CountStatus(int(nodesSnapped.Add(1)))
24962494
return nil
24972495
}); err != nil {
24982496
res.Err = err
@@ -2646,6 +2644,15 @@ func ApplySnapshots(
26462644
return err
26472645
}
26482646

2647+
// Sort by storeIdx so volumes are attached in order.
2648+
// AttachVolume assigns device names sequentially
2649+
// (/dev/sdd, /dev/sde, ...) based on attachment order,
2650+
// so attaching in storeIdx order ensures the device-to-
2651+
// store mapping is consistent with the original cluster.
2652+
slices.SortFunc(createdVolumes, func(a, b createdVolume) int {
2653+
return cmp.Compare(a.storeIdx, b.storeIdx)
2654+
})
2655+
26492656
// Attach and mount volumes sequentially since GCE
26502657
// does not support concurrent instance modifications.
26512658
for _, cv := range createdVolumes {
@@ -2662,7 +2669,7 @@ func ApplySnapshots(
26622669
l.Printf(buf.String())
26632670
return err
26642671
}
2665-
l.Printf("mounted %s at %s on %s", cv.volume.ProviderResourceID, mountDir, cVM.ProviderID)
2672+
l.Printf("mounted %s at %s on %s", cv.volume.ProviderResourceID, mountDir, cVM.Name)
26662673
}
26672674
// Save the cluster cache once after all volumes are
26682675
// attached and mounted for this node.

pkg/roachprod/vm/aws/aws.go

Lines changed: 70 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2622,6 +2622,7 @@ func (p *Provider) CreateVolume(
26222622
args = append(args, "--volume-type", vco.Type)
26232623
case "":
26242624
// Use the default.
2625+
args = append(args, "--volume-type", defaultEBSVolumeType)
26252626
default:
26262627
return vol, errors.Newf("Invalid volume type %q", vco.Type)
26272628
}
@@ -2718,8 +2719,73 @@ func (p *Provider) DeleteVolume(l *logger.Logger, volume vm.Volume, v *vm.VM) er
27182719
return nil
27192720
}
27202721

2721-
func (p *Provider) ListVolumes(l *logger.Logger, vm *vm.VM) ([]vm.Volume, error) {
2722-
return vm.NonBootAttachedVolumes, nil
2722+
func (p *Provider) ListVolumes(l *logger.Logger, v *vm.VM) ([]vm.Volume, error) {
2723+
if v.ProviderID == "" {
2724+
return nil, nil
2725+
}
2726+
region := v.Zone[:len(v.Zone)-1]
2727+
2728+
// Describe this instance to get block device mappings and the root device
2729+
// name, which we need to identify (and exclude) the boot volume.
2730+
var descResp DescribeInstancesOutput
2731+
descArgs := []string{
2732+
"ec2", "describe-instances",
2733+
"--region", region,
2734+
"--instance-ids", v.ProviderID,
2735+
}
2736+
if err := p.runJSONCommand(l, descArgs, &descResp); err != nil {
2737+
return nil, err
2738+
}
2739+
var instance *DescribeInstancesOutputInstance
2740+
for _, res := range descResp.Reservations {
2741+
for i := range res.Instances {
2742+
if res.Instances[i].InstanceID == v.ProviderID {
2743+
instance = &res.Instances[i]
2744+
break
2745+
}
2746+
}
2747+
}
2748+
if instance == nil {
2749+
l.Printf("WARNING: instance %s not found in describe-instances response for region %s", v.ProviderID, region)
2750+
return nil, nil
2751+
}
2752+
2753+
// Collect non-boot volume device names from the block device
2754+
// mappings. The root device is excluded so we only return data volumes.
2755+
deviceByVolumeID := make(map[string]string)
2756+
for _, bdm := range instance.BlockDeviceMappings {
2757+
if bdm.DeviceName != instance.RootDeviceName {
2758+
deviceByVolumeID[bdm.Disk.VolumeID] = bdm.DeviceName
2759+
}
2760+
}
2761+
if len(deviceByVolumeID) == 0 {
2762+
return nil, nil
2763+
}
2764+
2765+
// Describe the non-boot volumes to get their full metadata.
2766+
volsByInstance, err := p.getVolumesForInstances(
2767+
context.Background(), l, region, []string{v.ProviderID},
2768+
)
2769+
if err != nil {
2770+
return nil, err
2771+
}
2772+
2773+
var volumes []vm.Volume
2774+
for _, vol := range volsByInstance[v.ProviderID] {
2775+
if _, ok := deviceByVolumeID[vol.ProviderResourceID]; ok {
2776+
volumes = append(volumes, vol)
2777+
}
2778+
}
2779+
2780+
// Sort by device name to ensure deterministic ordering that matches
2781+
// the mount point assignment (/mnt/data1, /mnt/data2, etc.).
2782+
slices.SortFunc(volumes, func(a, b vm.Volume) int {
2783+
return strings.Compare(
2784+
deviceByVolumeID[a.ProviderResourceID],
2785+
deviceByVolumeID[b.ProviderResourceID],
2786+
)
2787+
})
2788+
return volumes, nil
27232789
}
27242790

27252791
type snapshotOutput struct {
@@ -2761,21 +2827,11 @@ func (p *Provider) CreateVolumeSnapshot(
27612827
return vm.VolumeSnapshot{}, err
27622828
}
27632829

2764-
// Wait for the snapshot to complete before returning. AWS snapshots
2765-
// are asynchronous and cannot be used to create volumes while pending.
2766-
waitArgs := []string{
2767-
"ec2", "wait", "snapshot-completed",
2768-
"--region", region,
2769-
"--snapshot-ids", so.SnapshotID,
2770-
}
2771-
if _, err := p.runCommand(l, waitArgs); err != nil {
2772-
return vm.VolumeSnapshot{}, errors.Wrapf(err, "waiting for snapshot %s to complete", so.SnapshotID)
2773-
}
2774-
27752830
return vm.VolumeSnapshot{
27762831
ID: so.SnapshotID,
27772832
Name: vsco.Name,
27782833
Region: region,
2834+
Status: so.State,
27792835
}, nil
27802836
}
27812837

@@ -2839,6 +2895,7 @@ func (p *Provider) ListVolumeSnapshots(
28392895
ID: so.SnapshotID,
28402896
Name: name,
28412897
Region: r,
2898+
Status: so.State,
28422899
})
28432900
}
28442901
mu.Lock()

pkg/roachprod/vm/gce/gcloud.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -742,8 +742,9 @@ func (p *Provider) CreateVolumeSnapshot(
742742
return vm.VolumeSnapshot{}, err
743743
}
744744
return vm.VolumeSnapshot{
745-
ID: createJsonResponse.ID,
746-
Name: createJsonResponse.Name,
745+
ID: createJsonResponse.ID,
746+
Name: createJsonResponse.Name,
747+
Status: createJsonResponse.Status,
747748
}, nil
748749
}
749750

@@ -755,7 +756,7 @@ func (p *Provider) ListVolumeSnapshots(
755756
"--project", p.GetProject(),
756757
"snapshots",
757758
"list",
758-
"--format", "json(name,id)",
759+
"--format", "json(name,id,status)",
759760
}
760761
var filters []string
761762
// Only list snapshots that are fully created. Without this filter,
@@ -786,8 +787,9 @@ func (p *Provider) ListVolumeSnapshots(
786787
continue
787788
}
788789
snapshots = append(snapshots, vm.VolumeSnapshot{
789-
ID: snapshotJson.ID,
790-
Name: snapshotJson.Name,
790+
ID: snapshotJson.ID,
791+
Name: snapshotJson.Name,
792+
Status: snapshotJson.Status,
791793
})
792794
}
793795
sort.Sort(vm.VolumeSnapshots(snapshots))

pkg/roachprod/vm/vm.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,17 @@ type VolumeSnapshot struct {
456456
// Region is set and used by the AWS provider to scope snapshot
457457
// operations to the correct region. Other providers may leave it empty.
458458
Region string
459+
// Status is the current state of the snapshot. The value is
460+
// provider-specific (e.g. "completed" on AWS, "READY" on GCE).
461+
// An empty string means the state is unknown.
462+
Status string
463+
}
464+
465+
// IsReady returns true when the snapshot has finished being created and
466+
// is usable. See VolumeSnapshot.Status for provider-specific values.
467+
func (v VolumeSnapshot) IsReady() bool {
468+
return v.Status == "READY" || // GCE
469+
v.Status == "completed" // AWS
459470
}
460471

461472
type VolumeSnapshots []VolumeSnapshot
@@ -616,8 +627,10 @@ type Provider interface {
616627
DeleteVolume(l *logger.Logger, volume Volume, vm *VM) error
617628
// AttachVolume attaches the given volume to the given VM.
618629
AttachVolume(l *logger.Logger, volume Volume, vm *VM) (string, error)
619-
// CreateVolumeSnapshot creates a snapshot of the given volume, using the
620-
// given options.
630+
// CreateVolumeSnapshot creates a snapshot of the given volume. Some
631+
// providers may return before the snapshot is fully ready. Callers
632+
// that need a completed snapshot should poll ListVolumeSnapshots
633+
// and check the Status field.
621634
CreateVolumeSnapshot(l *logger.Logger, volume Volume, vsco VolumeSnapshotCreateOpts) (VolumeSnapshot, error)
622635
// ListVolumeSnapshots lists the individual volume snapshots that satisfy
623636
// the search criteria.

0 commit comments

Comments
 (0)