@@ -23,6 +23,17 @@ import (
2323
2424var cockroachIOController = filepath .Join ("/sys/fs/cgroup/system.slice" , install .VirtualClusterLabel (install .SystemInterfaceName , 0 )+ ".service" , "io.max" )
2525
26+ // diskMajMinExpr is a shell command substitution that expands to the
27+ // "MAJ:MIN" pair of the block device backing /mnt/data1 on the current node.
28+ // It is evaluated on each node so that the device is resolved locally, since
29+ // GCE does not guarantee consistent /dev/sdX letter assignment across VMs
30+ // (see #170379). All cgroup-disk-staller commands that need the device's
31+ // major:minor go through this single expression.
32+ // Double quotes (not single) around [:space:] keep this expression usable
33+ // inside an outer `bash -c '...'` invocation (setThroughputCmd) without
34+ // terminating the surrounding single-quoted string.
35+ const diskMajMinExpr = `$(lsblk -dn -o MAJ:MIN "$(findmnt -n -o SOURCE /mnt/data1)" | tr -d "[:space:]")`
36+
2637const CgroupsDiskStallName = "cgroup-disk-stall"
2738
2839type CGroupDiskStaller struct {
@@ -352,11 +363,6 @@ func (s *CGroupDiskStaller) setThroughputCmd(
352363 bw throughput ,
353364 cockroachIOController string ,
354365) (string , error ) {
355- maj , min , err := s .DiskDeviceMajorMinor (ctx , l )
356- if err != nil {
357- return "" , err
358- }
359-
360366 var limits []string
361367 for _ , rw := range readOrWrite {
362368 bytesPerSecondStr := "max"
@@ -367,9 +373,9 @@ func (s *CGroupDiskStaller) setThroughputCmd(
367373 }
368374 l .Printf ("setting cgroup bandwith limits:\n %v" , limits )
369375
370- return fmt .Sprintf ("sudo /bin/bash -c 'echo %d:%d %s > %s'" ,
371- maj ,
372- min ,
376+ return fmt .Sprintf (
377+ `sudo /bin/bash -c 'echo "%s %s" > %s'` ,
378+ diskMajMinExpr ,
373379 strings .Join (limits , " " ),
374380 cockroachIOController ,
375381 ), nil
@@ -380,14 +386,9 @@ func (s *CGroupDiskStaller) setThroughputCmd(
380386func (s * CGroupDiskStaller ) GetReadWriteBytes (
381387 ctx context.Context , l * logger.Logger , node install.Nodes ,
382388) (int , int , error ) {
383- maj , min , err := s .DiskDeviceMajorMinor (ctx , l )
384- if err != nil {
385- return 0 , 0 , err
386- }
387- // Check the number of bytes read and written to disk.
388389 res , err := s .RunWithDetails (
389390 ctx , l , node ,
390- fmt .Sprintf (`grep -E '%d:%d' /sys/fs/cgroup/system.slice/io.stat |` , maj , min ),
391+ fmt .Sprintf (`grep -E "^%s" /sys/fs/cgroup/system.slice/io.stat |` , diskMajMinExpr ),
391392 `grep -oE 'rbytes=[0-9]+|wbytes=[0-9]+' |` ,
392393 `awk -F= '{printf "%s ", $2} END {print ""}'` ,
393394 )
@@ -415,9 +416,13 @@ const (
415416 DmsetupDiskStallName = "dmsetup-disk-stall"
416417 dmsetupStallCmd = "sudo dmsetup suspend --noflush --nolockfs data1"
417418 dmsetupUnstallCmd = "sudo dmsetup resume data1"
418- // dmsetupStateFile stores the original disk device name so cleanup can find it
419- // even when running as a separate stage after setup has modified the mount.
419+ // dmsetupStateFile stores the original disk device name (per node) so
420+ // cleanup can find it even when running as a separate stage after setup
421+ // has unmounted /mnt/data1 and remounted /dev/mapper/data1 over it.
420422 dmsetupStateFile = "/tmp/dmsetup-disk-stall-device"
423+ // dmsetupDevExpr is a shell expression that expands to the data disk
424+ // device name stored in dmsetupStateFile on the current node.
425+ dmsetupDevExpr = `"$(cat ` + dmsetupStateFile + `)"`
421426)
422427
423428type DmsetupDiskStaller struct {
@@ -446,36 +451,23 @@ func (s *DmsetupDiskStaller) Description() string {
446451 return "dmsetup disk staller"
447452}
448453
449- // getStoredDeviceName reads the device name from the state file saved during setup.
450- // This allows cleanup to find the original device even when running as a separate stage.
451- func (s * DmsetupDiskStaller ) getStoredDeviceName (
452- ctx context.Context , l * logger.Logger ,
453- ) (string , error ) {
454- res , err := s .RunWithDetails (ctx , l , s .c .Nodes [:1 ], fmt .Sprintf ("cat %s 2>/dev/null" , dmsetupStateFile ))
455- if err != nil {
456- return "" , errors .Wrap (err , "failed to read stored device name" )
457- }
458- dev := strings .TrimSpace (res .Stdout )
459- if dev == "" {
460- return "" , errors .New ("no stored device name found; run setup stage first" )
461- }
462- return dev , nil
463- }
464-
465- // getOrStoreDeviceName gets the disk device name, storing it to the state file if not already stored.
466- // This is used during setup to persist the device name for later stages.
467- func (s * DmsetupDiskStaller ) getOrStoreDeviceName (
468- ctx context.Context , l * logger.Logger ,
469- ) (string , error ) {
470- dev , err := s .DiskDeviceName (ctx , l )
471- if err != nil {
472- return "" , err
473- }
474- // Save the device name for later use by cleanup when running stages independently.
475- if err = s .Run (ctx , l , s .c .Nodes , fmt .Sprintf ("echo '%s' > %s" , dev , dmsetupStateFile )); err != nil {
476- return "" , errors .Wrap (err , "failed to save device name to state file" )
454+ // storeDeviceName writes the device backing /mnt/data1 (per node, via
455+ // findmnt) to dmsetupStateFile so subsequent commands can read it back via
456+ // dmsetupDevExpr. Doing this per node is required because GCE does not
457+ // guarantee consistent /dev/sdX letter assignment across VMs (see #170379).
458+ //
459+ // If overwrite is true, the state file is always rewritten (used by Setup).
460+ // If false, the file is only written when missing (used by Cleanup, since
461+ // after Setup has run findmnt would return /dev/mapper/data1 rather than the
462+ // original device).
463+ func (s * DmsetupDiskStaller ) storeDeviceName (
464+ ctx context.Context , l * logger.Logger , overwrite bool ,
465+ ) error {
466+ cmd := fmt .Sprintf (`findmnt -n -o SOURCE /mnt/data1 > %s` , dmsetupStateFile )
467+ if ! overwrite {
468+ cmd = fmt .Sprintf (`[ -s %s ] || ` , dmsetupStateFile ) + cmd
477469 }
478- return dev , nil
470+ return s . Run ( ctx , l , s . c . Nodes , cmd )
479471}
480472
481473func (s * DmsetupDiskStaller ) Setup (ctx context.Context , l * logger.Logger , args FailureArgs ) error {
@@ -492,10 +484,11 @@ func (s *DmsetupDiskStaller) Setup(ctx context.Context, l *logger.Logger, args F
492484 }
493485 }
494486
495- // Get the device name and store it for later stages (cleanup).
496- dev , err := s .getOrStoreDeviceName (ctx , l )
497- if err != nil {
498- return err
487+ // Resolve the data disk device on each node and persist it for later
488+ // stages (cleanup may run independently after the mount has been
489+ // replaced).
490+ if err = s .storeDeviceName (ctx , l , true /* overwrite */ ); err != nil {
491+ return errors .Wrap (err , "resolving data disk device" )
499492 }
500493
501494 // snapd will run "snapd auto-import /dev/dm-0" via udev triggers when
@@ -515,11 +508,12 @@ sudo udevadm settle`); err != nil {
515508 return err
516509 }
517510 // See https://github.com/cockroachdb/cockroach/issues/129619#issuecomment-2316147244.
518- if err = s .Run (ctx , l , s .c .Nodes , `sudo tune2fs -O ^has_journal ` + dev ); err != nil {
511+ if err = s .Run (ctx , l , s .c .Nodes , `sudo tune2fs -O ^has_journal ` + dmsetupDevExpr ); err != nil {
519512 return errors .WithHintf (err , "disabling journaling fails if the cluster has been started" )
520513 }
521- if err = s .Run (ctx , l , s .c .Nodes , `echo "0 $(sudo blockdev --getsz ` + dev + `) linear ` + dev + ` 0" | ` +
522- `sudo dmsetup create data1` ); err != nil {
514+ if err = s .Run (ctx , l , s .c .Nodes ,
515+ `dev=` + dmsetupDevExpr + `; echo "0 $(sudo blockdev --getsz "$dev") linear $dev 0" | ` +
516+ `sudo dmsetup create data1` ); err != nil {
523517 return err
524518 }
525519 // This has occasionally been seen to fail with "Device or resource busy",
@@ -605,15 +599,12 @@ func (s *DmsetupDiskStaller) Cleanup(
605599 }
606600 }
607601
608- // Try to get the device name from the state file first (for independent stage runs),
609- // fall back to live lookup (for full lifecycle runs where mount is still intact).
610- dev , err := s .getStoredDeviceName (ctx , l )
611- if err != nil {
612- l .Printf ("Could not read stored device name, trying live lookup: %v" , err )
613- dev , err = s .DiskDeviceName (ctx , l )
614- if err != nil {
615- return errors .Wrap (err , "failed to determine disk device" )
616- }
602+ // Ensure each node has its device name persisted in dmsetupStateFile.
603+ // In the common case Setup already wrote it; if cleanup is run standalone
604+ // it is populated here via findmnt while the original mount is still in
605+ // place.
606+ if err := s .storeDeviceName (ctx , l , false /* overwrite */ ); err != nil {
607+ return errors .Wrap (err , "determining disk device" )
617608 }
618609
619610 if err := s .Run (ctx , l , s .c .Nodes , `sudo dmsetup resume data1` ); err != nil {
@@ -625,11 +616,11 @@ func (s *DmsetupDiskStaller) Cleanup(
625616 if err := s .Run (ctx , l , s .c .Nodes , `sudo dmsetup remove data1` ); err != nil {
626617 return err
627618 }
628- if err := s .Run (ctx , l , s .c .Nodes , `sudo tune2fs -O has_journal ` + dev ); err != nil {
619+ if err := s .Run (ctx , l , s .c .Nodes , `sudo tune2fs -O has_journal ` + dmsetupDevExpr ); err != nil {
629620 return err
630621 }
631622 // Mount the original device back to /mnt/data1.
632- if err := s .Run (ctx , l , s .c .Nodes , fmt . Sprintf ( " sudo mount %s /mnt/data1" , dev ) ); err != nil {
623+ if err := s .Run (ctx , l , s .c .Nodes , ` sudo mount ` + dmsetupDevExpr + ` /mnt/data1` ); err != nil {
633624 return err
634625 }
635626 // Reenable snapd autoimport udev rules.
0 commit comments