Skip to content

Commit 48613a7

Browse files
sreejasahithi and Sreeja Chintalapati authored
HDDS-15455. Implement Custom DataNode Container Directory Discovery and Duplicate Detection (#10414).
Co-authored-by: Sreeja Chintalapati <schintalapati@Sreejas-MacBook-Pro.local>
1 parent 1e0ea62 commit 48613a7

12 files changed

Lines changed: 1256 additions & 25 deletions

File tree

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/StorageVolumeUtil.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.apache.hadoop.hdds.conf.ConfigurationSource;
2929
import org.apache.hadoop.ozone.OzoneConsts;
3030
import org.apache.hadoop.ozone.common.InconsistentStorageStateException;
31+
import org.apache.hadoop.ozone.common.Storage;
3132
import org.apache.hadoop.ozone.container.common.HDDSVolumeLayoutVersion;
3233
import org.apache.hadoop.ozone.container.common.volume.DbVolume;
3334
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
@@ -274,4 +275,31 @@ public static boolean checkVolume(StorageVolume volume, String scmId,
274275

275276
return success;
276277
}
278+
279+
public static File resolveContainerCurrentDir(
280+
File hddsRoot, String clusterId, File[] storageDirs)
281+
throws InconsistentStorageStateException {
282+
283+
File clusterIdDir = new File(hddsRoot, clusterId);
284+
//The subdirectory we should verify containers within.
285+
// If this volume was formatted pre SCM HA, this will be the SCM ID.
286+
// A cluster ID symlink will exist in this case only if this cluster is
287+
// finalized for SCM HA.
288+
// If the volume was formatted post SCM HA, this will be the cluster ID.
289+
File idDir = clusterIdDir;
290+
291+
if (storageDirs.length == 1 && !clusterIdDir.exists()) {
292+
// If the one directory is not the cluster ID directory, assume it is
293+
// the old SCM ID directory used before SCM HA.
294+
idDir = storageDirs[0];
295+
} else if (!clusterIdDir.exists()) {
296+
// There are 1 or more storage directories. We only care about the
297+
// cluster ID directory.
298+
throw new InconsistentStorageStateException(
299+
"Volume " + hddsRoot + " is in an inconsistent state. Expected cluster ID directory "
300+
+ clusterIdDir + " not found.");
301+
}
302+
303+
return new File(idDir, Storage.STORAGE_DIR_CURRENT);
304+
}
277305
}

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerReader.java

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@
2828
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
2929
import org.apache.hadoop.hdds.scm.container.ContainerID;
3030
import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
31-
import org.apache.hadoop.ozone.common.Storage;
31+
import org.apache.hadoop.ozone.common.InconsistentStorageStateException;
3232
import org.apache.hadoop.ozone.container.common.helpers.ContainerUtils;
3333
import org.apache.hadoop.ozone.container.common.impl.ContainerData;
3434
import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml;
3535
import org.apache.hadoop.ozone.container.common.impl.ContainerSet;
36+
import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
3637
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
3738
import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet;
3839
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer;
@@ -119,32 +120,19 @@ public void readVolume(File hddsVolumeRootDir) {
119120
// by HddsUtil#checkVolume once we have a cluster ID from SCM. No
120121
// operations to perform here in that case.
121122
if (storageDirs.length > 0) {
122-
File clusterIDDir = new File(hddsVolumeRootDir,
123-
hddsVolume.getClusterID());
124-
// The subdirectory we should verify containers within.
125-
// If this volume was formatted pre SCM HA, this will be the SCM ID.
126-
// A cluster ID symlink will exist in this case only if this cluster is
127-
// finalized for SCM HA.
128-
// If the volume was formatted post SCM HA, this will be the cluster ID.
129-
File idDir = clusterIDDir;
130-
if (storageDirs.length == 1 && !clusterIDDir.exists()) {
131-
// If the one directory is not the cluster ID directory, assume it is
132-
// the old SCM ID directory used before SCM HA.
133-
idDir = storageDirs[0];
134-
} else {
135-
// There are 1 or more storage directories. We only care about the
136-
// cluster ID directory.
137-
if (!clusterIDDir.exists()) {
138-
LOG.error("Volume {} is in an inconsistent state. Expected " +
139-
"clusterID directory {} not found.", hddsVolumeRootDir,
140-
clusterIDDir);
141-
volumeSet.failVolume(hddsVolumeRootDir.getPath());
142-
return;
143-
}
123+
File currentDir;
124+
try {
125+
currentDir = StorageVolumeUtil.resolveContainerCurrentDir(hddsVolumeRootDir,
126+
hddsVolume.getClusterID(), storageDirs);
127+
} catch (InconsistentStorageStateException e) {
128+
LOG.error("Volume {} is in an inconsistent state. Expected " +
129+
"clusterID directory {} not found.", hddsVolumeRootDir,
130+
new File(hddsVolumeRootDir, hddsVolume.getClusterID()));
131+
volumeSet.failVolume(hddsVolumeRootDir.getPath());
132+
return;
144133
}
145134

146135
LOG.info("Start to verify containers on volume {}", hddsVolumeRootDir);
147-
File currentDir = new File(idDir, Storage.STORAGE_DIR_CURRENT);
148136
File[] containerTopDirs = currentDir.listFiles();
149137
if (containerTopDirs != null && containerTopDirs.length > 0) {
150138
for (File containerTopDir : containerTopDirs) {

hadoop-ozone/cli-debug/src/main/java/org/apache/hadoop/ozone/debug/datanode/container/ContainerCommands.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController;
5959
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerReader;
6060
import org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures;
61+
import org.apache.hadoop.ozone.debug.datanode.container.analyze.AnalyzeSubcommand;
6162
import org.slf4j.Logger;
6263
import org.slf4j.LoggerFactory;
6364
import picocli.CommandLine.Command;
@@ -75,7 +76,8 @@
7576
ListSubcommand.class,
7677
InfoSubcommand.class,
7778
ExportSubcommand.class,
78-
InspectSubcommand.class
79+
InspectSubcommand.class,
80+
AnalyzeSubcommand.class
7981
})
8082
public class ContainerCommands extends AbstractSubcommand {
8183

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.hadoop.ozone.debug.datanode.container.analyze;
19+
20+
import java.io.IOException;
21+
import java.util.List;
22+
import java.util.Map;
23+
import java.util.concurrent.Callable;
24+
import org.apache.hadoop.hdds.cli.AbstractSubcommand;
25+
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
26+
import picocli.CommandLine;
27+
import picocli.CommandLine.Command;
28+
29+
/**
30+
* {@code ozone debug datanode container analyze}.
31+
*
32+
* <p>Compares on-disk container directories on this DataNode against SCM
33+
* metadata to report inconsistencies.
34+
*/
35+
@Command(
36+
name = "analyze",
37+
description = "Analyze container consistency between on-disk container " +
38+
"directories on this DataNode and SCM metadata. Must be run locally on a DataNode.")
39+
public class AnalyzeSubcommand extends AbstractSubcommand implements Callable<Void> {
40+
@CommandLine.Option(names = {"--count"},
41+
defaultValue = "20",
42+
description = "Number of containers to display")
43+
private int count;
44+
45+
@Override
46+
public Void call() throws Exception {
47+
if (count < 1) {
48+
throw new IOException("Count must be an integer greater than 0.");
49+
}
50+
OzoneConfiguration conf = getOzoneConf();
51+
ContainerScanResult scanResult = ContainerDirectoryScanner.scan(conf);
52+
Map<Long, List<ContainerDiskOccurrence>> enrichedDuplicates =
53+
ContainerDirectoryScanner.enrichDuplicates(scanResult.getDuplicates());
54+
55+
// TODO: SCM metadata lookup from --scm-db when provided.
56+
// TODO: For each id in scanResult.getSingles().keySet() classified NOT_IN_SCM or DELETED:
57+
// enrichOccurrence(id, scanResult.getSingles().get(id)) and report.
58+
// TODO: For each id in enrichedDuplicates.keySet() classified NOT_IN_SCM or DELETED:
59+
// enrichedDuplicates.get(id) is already enriched — just report.
60+
61+
printDuplicates(enrichedDuplicates);
62+
printVolumeScanErrors(scanResult.getVolumeScanErrors());
63+
return null;
64+
}
65+
66+
private void printDuplicates(Map<Long, List<ContainerDiskOccurrence>> duplicates) {
67+
long totalDuplicateIds = duplicates.size();
68+
out().printf("Number of containers with duplicate container directories on this DataNode: %d%n", totalDuplicateIds);
69+
70+
if (totalDuplicateIds == 0) {
71+
return;
72+
}
73+
74+
if (totalDuplicateIds > count) {
75+
out().printf("Showing first %d:%n", count);
76+
}
77+
78+
duplicates.entrySet().stream()
79+
.sorted(Map.Entry.comparingByKey())
80+
.limit(count)
81+
.forEach(entry -> {
82+
long containerId = entry.getKey();
83+
List<ContainerDiskOccurrence> occurrences = entry.getValue();
84+
out().printf("Container %d (%d occurrences):%n", containerId, occurrences.size());
85+
for (ContainerDiskOccurrence o : occurrences) {
86+
out().printf(" path=%s%n", o.getContainerPath());
87+
if (o.isSizeKnown()) {
88+
out().printf(" status=%s size=%d bytes%n", o.getStatus(), o.getSizeBytes());
89+
} else {
90+
out().printf(" status=%s size=unavailable (failed to compute directory size)%n",
91+
o.getStatus());
92+
}
93+
out().println();
94+
}
95+
});
96+
}
97+
98+
private void printVolumeScanErrors(List<String> volumeScanErrors) {
99+
if (volumeScanErrors.isEmpty()) {
100+
return;
101+
}
102+
err().printf("%nVolumes that failed to scan (%d):%n", volumeScanErrors.size());
103+
for (String error : volumeScanErrors) {
104+
err().printf(" %s%n", error);
105+
}
106+
}
107+
}

0 commit comments

Comments
 (0)