Skip to content

Commit 27dcc20

Browse files
authored
HDDS-14114. [DiskBalancer] Fix DiskBalancer tmp directory to be under hdds/<cluster-id>/tmp (#9515)
1 parent 216cd28 commit 27dcc20

2 files changed

Lines changed: 205 additions & 15 deletions

File tree

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.hadoop.ozone.container.diskbalancer;
1919

2020
import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CONTAINER_ALREADY_EXISTS;
21+
import static org.apache.hadoop.ozone.container.common.volume.StorageVolume.TMP_DIR_NAME;
2122
import static org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.calculateVolumeDataDensity;
2223
import static org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.getIdealUsage;
2324
import static org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.getVolumeUsages;
@@ -66,7 +67,6 @@
6667
import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
6768
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
6869
import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet;
69-
import org.apache.hadoop.ozone.container.common.volume.StorageVolume;
7070
import org.apache.hadoop.ozone.container.common.volume.VolumeChoosingPolicyFactory;
7171
import org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.VolumeFixedUsage;
7272
import org.apache.hadoop.ozone.container.diskbalancer.policy.ContainerChoosingPolicy;
@@ -165,8 +165,6 @@ public DiskBalancerService(OzoneContainer ozoneContainer,
165165
metrics = DiskBalancerServiceMetrics.create();
166166

167167
loadDiskBalancerInfo();
168-
169-
constructTmpDir();
170168
}
171169

172170
/**
@@ -178,17 +176,45 @@ public synchronized void refresh(DiskBalancerInfo diskBalancerInfo) throws IOExc
178176
applyDiskBalancerInfo(diskBalancerInfo);
179177
}
180178

181-
private void constructTmpDir() throws IOException {
179+
/**
180+
* Cleans up stale diskBalancer temporary directories on startup.
181+
*
182+
* @throws IOException if cleanup fails
183+
*/
184+
private void cleanupTmpDir() throws IOException {
182185
for (HddsVolume volume:
183186
StorageVolumeUtil.getHddsVolumesList(volumeSet.getVolumesList())) {
184-
Path tmpDir = getDiskBalancerTmpDir(volume);
187+
Path diskBalancerTmpDir = null;
185188
try {
186-
FileUtils.deleteDirectory(tmpDir.toFile());
187-
FileUtils.forceMkdir(tmpDir.toFile());
189+
File tmpDir = volume.getTmpDir();
190+
if (tmpDir != null) {
191+
// If tmpDir is initialized, use it directly
192+
diskBalancerTmpDir = tmpDir.toPath().resolve(DISK_BALANCER_DIR);
193+
} else {
194+
// If tmpDir is not initialized, construct the path manually
195+
// This handles the case where stale directories exist from previous
196+
// failed moves even though volumes haven't been initialized yet
197+
String clusterId = volume.getClusterID();
198+
if (clusterId == null) {
199+
// Skip volumes without clusterID - they're not properly formatted
200+
continue;
201+
}
202+
String workDirName = volume.getWorkingDirName();
203+
if (workDirName == null) {
204+
workDirName = clusterId;
205+
}
206+
diskBalancerTmpDir = Paths.get(volume.getStorageDir().toString(),
207+
workDirName, TMP_DIR_NAME, DISK_BALANCER_DIR);
208+
}
209+
210+
// Clean up any existing diskBalancer directory from previous runs
211+
if (diskBalancerTmpDir.toFile().exists()) {
212+
FileUtils.deleteDirectory(diskBalancerTmpDir.toFile());
213+
LOG.info("Cleaned up stale diskBalancer tmp directory: {}", diskBalancerTmpDir);
214+
}
188215
} catch (IOException ex) {
189-
LOG.warn("Can not reconstruct tmp directory under volume {}", volume,
190-
ex);
191-
throw ex;
216+
LOG.warn("Failed to clean up diskBalancer tmp directory under volume {}: {}",
217+
volume, diskBalancerTmpDir, ex);
192218
}
193219
}
194220
}
@@ -341,6 +367,17 @@ public void setVersion(DiskBalancerVersion version) {
341367
this.version = version;
342368
}
343369

370+
@Override
371+
public synchronized void start() {
372+
// Clean up any stale diskBalancer tmp directories from previous runs
373+
try {
374+
cleanupTmpDir();
375+
} catch (IOException e) {
376+
LOG.warn("Failed to clean up diskBalancer tmp directories before starting service", e);
377+
}
378+
super.start();
379+
}
380+
344381
@Override
345382
public BackgroundTaskQueue getTasks() {
346383
BackgroundTaskQueue queue = new BackgroundTaskQueue();
@@ -477,8 +514,8 @@ public BackgroundTaskResult call() {
477514
container.readLock();
478515
try {
479516
// Step 1: Copy container to new Volume's tmp Dir
480-
diskBalancerTmpDir = destVolume.getTmpDir().toPath()
481-
.resolve(DISK_BALANCER_DIR).resolve(String.valueOf(containerId));
517+
diskBalancerTmpDir = getDiskBalancerTmpDir(destVolume)
518+
.resolve(String.valueOf(containerId));
482519
ozoneContainer.getController().copyContainer(containerData, diskBalancerTmpDir);
483520

484521
// Step 2: verify checksum and Transition Temp container to Temp C1-RECOVERING
@@ -697,9 +734,8 @@ public long calculateBytesToMove(List<VolumeFixedUsage> inputVolumeSet) {
697734
return totalBytesToMove;
698735
}
699736

700-
private Path getDiskBalancerTmpDir(HddsVolume hddsVolume) {
701-
return Paths.get(hddsVolume.getVolumeRootDir())
702-
.resolve(StorageVolume.TMP_DIR_NAME).resolve(DISK_BALANCER_DIR);
737+
private Path getDiskBalancerTmpDir(HddsVolume hddsVolume) throws IOException {
738+
return hddsVolume.getTmpDir().toPath().resolve(DISK_BALANCER_DIR);
703739
}
704740

705741
public DiskBalancerServiceMetrics getMetrics() {

hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerService.java

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818
package org.apache.hadoop.ozone.container.diskbalancer;
1919

2020
import static org.apache.hadoop.ozone.container.common.ContainerTestUtils.createDbInstancesForTestIfNeeded;
21+
import static org.apache.hadoop.ozone.container.common.volume.StorageVolume.TMP_DIR_NAME;
2122
import static org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.getVolumeUsages;
2223
import static org.junit.jupiter.api.Assertions.assertEquals;
2324
import static org.junit.jupiter.api.Assertions.assertFalse;
25+
import static org.junit.jupiter.api.Assertions.assertNull;
2426
import static org.junit.jupiter.api.Assertions.assertThrows;
2527
import static org.junit.jupiter.api.Assertions.assertTrue;
2628
import static org.mockito.ArgumentMatchers.any;
@@ -52,6 +54,7 @@
5254
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
5355
import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet;
5456
import org.apache.hadoop.ozone.container.common.volume.StorageVolume;
57+
import org.apache.hadoop.ozone.container.common.volume.VolumeSet;
5558
import org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.VolumeFixedUsage;
5659
import org.apache.hadoop.ozone.container.diskbalancer.policy.ContainerChoosingPolicy;
5760
import org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy;
@@ -114,6 +117,45 @@ public void cleanup() throws IOException {
114117
volumeSet.shutdown();
115118
}
116119

120+
/**
121+
* Creates stale diskBalancer directories to simulate leftover directories
122+
* from previous failed container moves.
123+
*
124+
* @param volumeSet the volume set containing volumes to create stale dirs for
125+
* @param clusterId the cluster ID to use when constructing paths for uninitialized volumes
126+
* @throws IOException if directory creation fails
127+
*/
128+
private void createStaleDiskBalancerDirs(VolumeSet volSet, String clusterId)
129+
throws IOException {
130+
List<StorageVolume> volumes = volSet.getVolumesList();
131+
for (StorageVolume volume : volumes) {
132+
if (volume instanceof HddsVolume) {
133+
HddsVolume hddsVolume = (HddsVolume) volume;
134+
File staleDiskBalancerDir;
135+
136+
File volumeTmpDir = hddsVolume.getTmpDir();
137+
if (volumeTmpDir != null) {
138+
// If tmpDir is initialized, use it directly
139+
staleDiskBalancerDir = new File(volumeTmpDir, DiskBalancerService.DISK_BALANCER_DIR);
140+
} else {
141+
// If tmpDir is not initialized, construct the path manually
142+
File clusterIdDir = new File(hddsVolume.getHddsRootDir(), clusterId);
143+
File tmpDirPath = new File(clusterIdDir, TMP_DIR_NAME);
144+
staleDiskBalancerDir = new File(tmpDirPath, DiskBalancerService.DISK_BALANCER_DIR);
145+
}
146+
147+
// Create stale directory with some content
148+
assertTrue(staleDiskBalancerDir.mkdirs(),
149+
"Failed to create stale diskBalancer directory: " + staleDiskBalancerDir.getAbsolutePath());
150+
File staleContainerDir = new File(staleDiskBalancerDir, "12345");
151+
assertTrue(staleContainerDir.mkdirs());
152+
// Verify stale directory exists before cleanup
153+
assertTrue(staleDiskBalancerDir.exists(),
154+
"Stale diskBalancer directory should exist before cleanup");
155+
}
156+
}
157+
}
158+
117159
@ContainerTestVersionInfo.ContainerTest
118160
public void testUpdateService(ContainerTestVersionInfo versionInfo) throws Exception {
119161
setLayoutAndSchemaForTest(versionInfo);
@@ -361,4 +403,116 @@ public void testDiskBalancerConfigurationThresholdValidation(double threshold,
361403
assertEquals(expectedThreshold, config.getThreshold(), 0.0001);
362404
}
363405
}
406+
407+
@ContainerTestVersionInfo.ContainerTest
408+
public void testDiskBalancerCleansUpStaleTmpDir(ContainerTestVersionInfo versionInfo) throws Exception {
409+
setLayoutAndSchemaForTest(versionInfo);
410+
// Start volumes to initialize tmp directories
411+
volumeSet.startAllVolume();
412+
413+
ContainerSet containerSet = ContainerSet.newReadOnlyContainerSet(1000);
414+
ContainerMetrics metrics = ContainerMetrics.create(conf);
415+
KeyValueHandler keyValueHandler =
416+
new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
417+
metrics, c -> {
418+
}, new ContainerChecksumTreeManager(conf));
419+
420+
// Create stale diskBalancer directories to simulate leftover from previous run
421+
createStaleDiskBalancerDirs(volumeSet, scmId);
422+
423+
// Use actual DiskBalancerService (not TestImpl) to test the real start() method
424+
OzoneContainer ozoneContainer = mockDependencies(containerSet, keyValueHandler, null);
425+
DiskBalancerService svc = new DiskBalancerService(ozoneContainer, 1000, 1000,
426+
TimeUnit.MILLISECONDS, 1, conf);
427+
428+
// Start the service, which should clean up stale tmp directories via cleanupTmpDir()
429+
svc.start();
430+
431+
// Verify stale diskBalancer tmp directories are cleaned up
432+
for (StorageVolume volume : volumeSet.getVolumesList()) {
433+
if (volume instanceof HddsVolume) {
434+
HddsVolume hddsVolume = (HddsVolume) volume;
435+
File volumeTmpDir = hddsVolume.getTmpDir();
436+
File diskBalancerTmpDir = new File(volumeTmpDir, DiskBalancerService.DISK_BALANCER_DIR);
437+
438+
// Verify stale directory is cleaned up (should not exist)
439+
assertFalse(diskBalancerTmpDir.exists(),
440+
"Stale diskBalancer tmp directory should be cleaned up on startup");
441+
}
442+
}
443+
444+
svc.shutdown();
445+
}
446+
447+
@ContainerTestVersionInfo.ContainerTest
448+
public void testDiskBalancerCleanupWhenTmpDirNotInitialized(ContainerTestVersionInfo versionInfo) throws Exception {
449+
setLayoutAndSchemaForTest(versionInfo);
450+
// Create a fresh volume set WITHOUT calling createDbInstancesForTestIfNeeded
451+
// This simulates volumes that are formatted but tmpDir is not initialized
452+
MutableVolumeSet testVolumeSet = new MutableVolumeSet(datanodeUuid, scmId, conf, null,
453+
StorageVolume.VolumeType.DATA_VOLUME, null);
454+
455+
// Format volumes and ensure clusterID directory exists, but DON'T create tmp dirs
456+
// This simulates the scenario where tmpDir is null
457+
for (StorageVolume volume : testVolumeSet.getVolumesList()) {
458+
if (volume instanceof HddsVolume) {
459+
HddsVolume hddsVolume = (HddsVolume) volume;
460+
// Format volume to set clusterID
461+
hddsVolume.format(scmId);
462+
// Manually create the clusterID directory (needed for tmpDir creation)
463+
// but don't call createWorkingDir() or createTmpDirs()
464+
File clusterIdDir = new File(hddsVolume.getHddsRootDir(), scmId);
465+
if (!clusterIdDir.exists()) {
466+
assertTrue(clusterIdDir.mkdirs(),
467+
"Failed to create clusterID directory: " + clusterIdDir.getAbsolutePath());
468+
}
469+
// Verify tmpDir is null (not initialized)
470+
assertNull(hddsVolume.getTmpDir());
471+
}
472+
}
473+
474+
// Create stale diskBalancer directories manually to simulate leftover from failed move
475+
// This tests the scenario where stale dirs exist even though tmpDir is not initialized
476+
createStaleDiskBalancerDirs(testVolumeSet, scmId);
477+
478+
ContainerSet containerSet = ContainerSet.newReadOnlyContainerSet(1000);
479+
ContainerMetrics metrics = ContainerMetrics.create(conf);
480+
KeyValueHandler keyValueHandler =
481+
new KeyValueHandler(conf, datanodeUuid, containerSet, testVolumeSet,
482+
metrics, c -> {
483+
}, new ContainerChecksumTreeManager(conf));
484+
485+
// Use actual DiskBalancerService (not TestImpl) to test the real start() method
486+
OzoneContainer ozoneContainer = mockDependencies(containerSet, keyValueHandler, null);
487+
// Override getVolumeSet to return our test volume set
488+
when(ozoneContainer.getVolumeSet()).thenReturn(testVolumeSet);
489+
490+
DiskBalancerService svc = new DiskBalancerService(ozoneContainer, 1000, 1000,
491+
TimeUnit.MILLISECONDS, 1, conf);
492+
493+
// Start the service - cleanup should handle volumes with uninitialized tmpDir
494+
// and clean up stale directories even when tmpDir is null
495+
svc.start();
496+
497+
// Verify stale directories are cleaned up even though tmpDir is not initialized
498+
List<StorageVolume> volumes = testVolumeSet.getVolumesList();
499+
for (StorageVolume volume : volumes) {
500+
if (volume instanceof HddsVolume) {
501+
HddsVolume hddsVolume = (HddsVolume) volume;
502+
// tmpDir should still be null - cleanup doesn't initialize it
503+
assertNull(hddsVolume.getTmpDir(),
504+
"tmpDir should not be initialized by cleanup, it will be created lazily");
505+
506+
// Verify stale diskBalancer directory is cleaned up
507+
File hddsRootDir = hddsVolume.getHddsRootDir();
508+
File expectedDiskBalancerTmpDir = new File(new File(hddsRootDir, scmId),
509+
TMP_DIR_NAME + File.separator + DiskBalancerService.DISK_BALANCER_DIR);
510+
assertFalse(expectedDiskBalancerTmpDir.exists(),
511+
"Stale diskBalancer directory should be cleaned up even when tmpDir is not initialized");
512+
}
513+
}
514+
515+
svc.shutdown();
516+
testVolumeSet.shutdown();
517+
}
364518
}

0 commit comments

Comments
 (0)