
Commit 25a1f80

HDDS-13969. Add SCM metric for number of datanodes out of space (#9339)
1 parent 0a9df7b commit 25a1f80

3 files changed: 65 additions & 0 deletions

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java

Lines changed: 52 additions & 0 deletions
@@ -22,6 +22,7 @@
 import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
 import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY;
 import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY_READONLY;
+import static org.apache.hadoop.hdds.scm.SCMCommonPlacementPolicy.hasEnoughSpace;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
@@ -51,8 +52,10 @@
 import javax.management.ObjectName;
 import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.conf.StorageUnit;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.DatanodeID;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.StorageTypeProto;
@@ -67,6 +70,7 @@
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.hdds.scm.VersionInfo;
 import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
 import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeMetric;
 import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeStat;
 import org.apache.hadoop.hdds.scm.events.SCMEvents;
@@ -84,6 +88,7 @@
 import org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager;
 import org.apache.hadoop.ipc.Server;
 import org.apache.hadoop.metrics2.util.MBeans;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
 import org.apache.hadoop.ozone.OzoneConsts;
 import org.apache.hadoop.ozone.protocol.VersionResponse;
 import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
@@ -122,6 +127,7 @@ public class SCMNodeManager implements NodeManager {
   private final SCMNodeMetrics metrics;
   // Node manager MXBean
   private ObjectName nmInfoBean;
+  private final OzoneConfiguration conf;
   private final SCMStorageConfig scmStorageConfig;
   private final NetworkTopology clusterMap;
   private final Function<String, String> nodeResolver;
@@ -170,6 +176,7 @@ public SCMNodeManager(
       SCMContext scmContext,
       HDDSLayoutVersionManager layoutVersionManager,
       Function<String, String> nodeResolver) {
+    this.conf = conf;
     this.scmNodeEventPublisher = eventPublisher;
     this.nodeStateManager = new NodeStateManager(conf, eventPublisher,
         layoutVersionManager, scmContext);
@@ -1274,6 +1281,8 @@ public Map<String, String> getNodeStatistics() {
     nodeStateStatistics(nodeStatistics);
     // Statistics node space
     nodeSpaceStatistics(nodeStatistics);
+    // Statistics node out of space
+    nodeOutOfSpaceStatistics(nodeStatistics);
     // todo: Statistics of other instances
     return nodeStatistics;
   }
@@ -1361,6 +1370,49 @@ private void nodeSpaceStatistics(Map<String, String> nodeStatics) {
     nodeStatics.put(SpaceStatistics.NON_SCM_USED.getLabel(), nonScmUsed);
   }
 
+  private void nodeOutOfSpaceStatistics(Map<String, String> nodeStatics) {
+    List<DatanodeInfo> allNodes = getAllNodes();
+    long blockSize = (long) conf.getStorageSize(
+        OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE,
+        OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE_DEFAULT,
+        StorageUnit.BYTES);
+    long minRatisVolumeSizeBytes = (long) conf.getStorageSize(
+        ScmConfigKeys.OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN,
+        ScmConfigKeys.OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN_DEFAULT,
+        StorageUnit.BYTES);
+    long containerSize = (long) conf.getStorageSize(
+        ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE,
+        ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE_DEFAULT,
+        StorageUnit.BYTES);
+
+    int nodeOutOfSpaceCount = (int) allNodes.parallelStream()
+        .filter(dn -> !hasEnoughSpace(dn, minRatisVolumeSizeBytes, containerSize, conf)
+            && !hasContainerWithSpace(dn, blockSize, containerSize))
+        .count();
+
+    nodeStatics.put("NodesOutOfSpace", String.valueOf(nodeOutOfSpaceCount));
+  }
+
+  /**
+   * Check if a datanode has any OPEN container with enough space to accept new blocks.
+   */
+  private boolean hasContainerWithSpace(DatanodeInfo dnInfo, long blockSize, long containerSize) {
+    try {
+      Set<ContainerID> containers = getContainers(dnInfo);
+      for (ContainerID containerID : containers) {
+        ContainerInfo containerInfo = scmContext.getScm().getContainerManager().getContainer(containerID);
+
+        if (containerInfo.getState() == HddsProtos.LifeCycleState.OPEN &&
+            containerInfo.getUsedBytes() + blockSize <= containerSize) {
+          return true;
+        }
+      }
+    } catch (Exception e) {
+      LOG.debug("Error checking containers for datanode {}: {}", dnInfo.getID(), e.getMessage());
+    }
+    return false;
+  }
+
   /**
    * Based on the current time and the last heartbeat, calculate the time difference
    * and get a string of the relative value. E.g. "2s ago", "1m 2s ago", etc.
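For context on the thresholds above: the three sizes feeding the out-of-space predicate all come from standard Ozone settings, resolved with the exact keys and defaults shown in the diff. The standalone sketch below (not part of the patch; the class name and main() wrapper are added purely for illustration) resolves the same settings and prints them in bytes, so you can see what values a given configuration feeds into nodeOutOfSpaceStatistics().

import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.ozone.OzoneConfigKeys;

public final class OutOfSpaceThresholds {
  public static void main(String[] args) {
    OzoneConfiguration conf = new OzoneConfiguration();
    // Size of a new block; an OPEN container must fit one more of these.
    long blockSize = (long) conf.getStorageSize(
        OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE,
        OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE_DEFAULT,
        StorageUnit.BYTES);
    // Minimum free space that must remain on a Ratis volume.
    long minRatisVolumeFree = (long) conf.getStorageSize(
        ScmConfigKeys.OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN,
        ScmConfigKeys.OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN_DEFAULT,
        StorageUnit.BYTES);
    // Full container size a volume must accommodate to open a new container.
    long containerSize = (long) conf.getStorageSize(
        ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE,
        ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE_DEFAULT,
        StorageUnit.BYTES);
    // A datanode counts as out of space only when both checks fail: it cannot
    // create a new container (hasEnoughSpace) AND no existing OPEN container
    // can absorb one more block (hasContainerWithSpace).
    System.out.printf("blockSize=%d, minRatisVolumeFree=%d, containerSize=%d%n",
        blockSize, minRatisVolumeFree, containerSize);
  }
}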

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java

Lines changed: 10 additions & 0 deletions
@@ -132,6 +132,7 @@ void incNumNodeCommandQueueReportProcessingFailed() {
   public void getMetrics(MetricsCollector collector, boolean all) {
     Map<String, Map<String, Integer>> nodeCount = managerMXBean.getNodeCount();
     Map<String, Long> nodeInfo = managerMXBean.getNodeInfo();
+    Map<String, String> nodeStatistics = managerMXBean.getNodeStatistics();
     int totalNodeCount = 0;
     /**
      * Loop over the Node map and create a metric for the cross product of all
@@ -156,6 +157,15 @@ public void getMetrics(MetricsCollector collector, boolean all) {
     metrics.addGauge(
         Interns.info("AllNodes", "Number of datanodes"), totalNodeCount);
 
+    String nodesOutOfSpace = nodeStatistics.get("NodesOutOfSpace");
+    if (nodesOutOfSpace != null) {
+      metrics.addGauge(
+          Interns.info("NodesOutOfSpace", "Number of datanodes that cannot accept new writes because " +
+              "they lack sufficient metadata space, data volume space for creating new containers, " +
+              "or free space in existing open containers."),
+          Integer.parseInt(nodesOutOfSpace));
+    }
+
     for (Map.Entry<String, Long> e : nodeInfo.entrySet()) {
       metrics.addGauge(
           Interns.info(e.getKey(), diskMetricDescription(e.getKey())),
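
Once registered, the gauge is visible wherever metrics2 publishes, including JMX. Below is a minimal sketch, not from the patch, of reading it in-process; the ObjectName is an assumption based on Hadoop's usual "Hadoop:service=<service>,name=<source>" naming, and should be verified against the SCM's /jmx endpoint before relying on it.

import java.lang.management.ManagementFactory;
import javax.management.MBeanServer;
import javax.management.ObjectName;

public final class NodesOutOfSpaceProbe {
  public static void main(String[] args) throws Exception {
    // Works inside the SCM JVM; adapt with a JMXConnector for remote access.
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    // Hypothetical bean name; the metrics source registers as "SCMNodeMetrics".
    ObjectName name = new ObjectName(
        "Hadoop:service=StorageContainerManager,name=SCMNodeMetrics");
    Object value = mbs.getAttribute(name, "NodesOutOfSpace");
    System.out.println("NodesOutOfSpace = " + value);
  }
}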

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeMetrics.java

Lines changed: 3 additions & 0 deletions
@@ -226,6 +226,9 @@ public void testNodeCountAndInfoMetricsReported() throws Exception {
         getMetrics(SCMNodeMetrics.class.getSimpleName()));
     assertGauge("AllNodes", 1,
         getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    // The DN has no metadata volumes, so hasEnoughSpace() returns false,
+    // indicating the DN is out of space.
+    assertGauge("NodesOutOfSpace", 1,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
     assertGauge("TotalCapacity", 100L,
         getMetrics(SCMNodeMetrics.class.getSimpleName()));
     assertGauge("TotalUsed", 10L,
