Skip to content

Commit 03f367b

Browse files
authored
HDDS-14119. Capture all container replication status in SCM container info (#9472)
1 parent d1f7fe3 commit 03f367b

41 files changed

Lines changed: 885 additions & 483 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.hadoop.hdds.scm.container;
19+
20+
import java.util.HashMap;
21+
import java.util.Map;
22+
23+
/**
 * Enum representing container health states.
 *
 * <p>Each health state carries a unique {@code short} value (2 bytes), a
 * human-readable description, and a metric name. The constants fall into two
 * groups:
 * <ul>
 *   <li>individual health states (e.g. {@link #UNDER_REPLICATED}), which use
 *       values 0-99;</li>
 *   <li>combinations that actually occur in the replication handlers, which
 *       use values 100 and above.</li>
 * </ul>
 *
 * <p>This enum replaces ReplicationManagerReport.HealthState and is used both
 * for tracking container health in ContainerInfo and for metrics/reporting.
 */
public enum ContainerHealthState {

  // ========== Individual Health States (0-99) ==========

  /**
   * Container is healthy with no issues.
   */
  HEALTHY((short) 0,
      "Container is healthy",
      "HealthyContainers"),

  /**
   * Container has insufficient replicas.
   */
  UNDER_REPLICATED((short) 1,
      "Containers with insufficient replicas",
      "UnderReplicatedContainers"),

  /**
   * Container violates placement policy.
   */
  MIS_REPLICATED((short) 2,
      "Containers with insufficient racks",
      "MisReplicatedContainers"),

  /**
   * Container has excess replicas.
   */
  OVER_REPLICATED((short) 3,
      "Containers with more replicas than required",
      "OverReplicatedContainers"),

  /**
   * Critical: No online replicas available.
   */
  MISSING((short) 4,
      "Containers with no online replicas",
      "MissingContainers"),

  /**
   * Closed/Quasi-Closed with inconsistent replica states.
   */
  UNHEALTHY((short) 5,
      "Containers Closed or Quasi_Closed having some replicas in a different state",
      "UnhealthyContainers"),

  /**
   * Container has no blocks (metadata only).
   */
  EMPTY((short) 6,
      "Containers having no blocks",
      "EmptyContainers"),

  /**
   * Open container with inconsistent replica states.
   */
  OPEN_UNHEALTHY((short) 7,
      "Containers open and having replicas with different states",
      "OpenUnhealthyContainers"),

  /**
   * Container stuck in QUASI_CLOSED state.
   */
  QUASI_CLOSED_STUCK((short) 8,
      "Containers QuasiClosed with insufficient datanode origins",
      "StuckQuasiClosedContainers"),

  /**
   * Open container without healthy pipeline.
   */
  OPEN_WITHOUT_PIPELINE((short) 9,
      "Containers in OPEN state without any healthy Pipeline",
      "OpenContainersWithoutPipeline"),

  // ========== Actual Combinations Found in Code (100+) ==========

  /**
   * Container is unhealthy AND under-replicated.
   * Occurs in RatisUnhealthyReplicationCheckHandler when container has only
   * unhealthy replicas and needs more replicas.
   */
  UNHEALTHY_UNDER_REPLICATED((short) 100,
      "Inconsistent states with insufficient replicas",
      "UnhealthyUnderReplicatedContainers"),

  /**
   * Container is unhealthy AND over-replicated.
   * Occurs in RatisUnhealthyReplicationCheckHandler when container has only
   * unhealthy replicas and has too many replicas.
   */
  UNHEALTHY_OVER_REPLICATED((short) 101,
      "Inconsistent states with excess replicas",
      "UnhealthyOverReplicatedContainers"),

  /**
   * Container is missing AND under-replicated.
   * Occurs in ECReplicationCheckHandler when EC container is unrecoverable
   * (missing) and also has unreplicated offline indexes needing replication.
   */
  MISSING_UNDER_REPLICATED((short) 102,
      "No online replicas with offline indexes needing replication",
      "MissingUnderReplicatedContainers"),

  /**
   * Container is quasi-closed-stuck AND under-replicated.
   * Occurs in QuasiClosedStuckReplicationCheck when container is stuck in
   * QUASI_CLOSED and also needs more replicas.
   */
  QUASI_CLOSED_STUCK_UNDER_REPLICATED((short) 103,
      "Stuck in quasi-closed state with insufficient replicas",
      "QuasiClosedStuckUnderReplicatedContainers"),

  /**
   * Container is quasi-closed-stuck AND over-replicated.
   * Occurs in QuasiClosedStuckReplicationCheck when container is stuck in
   * QUASI_CLOSED and also has excess replicas.
   */
  QUASI_CLOSED_STUCK_OVER_REPLICATED((short) 104,
      "Stuck in quasi-closed state with excess replicas",
      "QuasiClosedStuckOverReplicatedContainers"),

  /**
   * Container is quasi-closed-stuck AND missing (no replicas).
   * Occurs when QuasiClosedContainerHandler marks as QUASI_CLOSED_STUCK,
   * then QuasiClosedStuckReplicationCheck finds no replicas.
   */
  QUASI_CLOSED_STUCK_MISSING((short) 105,
      "Stuck in quasi-closed state with no replicas",
      "QuasiClosedStuckMissingContainers");

  // ========== Enum Fields ==========

  private final short value;
  private final String description;
  private final String metricName;

  // Static lookup map for efficient fromValue(). Populated once in the static
  // initializer below and never modified afterwards.
  private static final Map<Short, ContainerHealthState> VALUE_MAP = new HashMap<>();

  static {
    for (ContainerHealthState state : values()) {
      // put() returns the previous mapping, so a non-null result means two
      // constants were declared with the same short value. Fail at class
      // initialization instead of letting the later constant silently win.
      ContainerHealthState previous = VALUE_MAP.put(state.value, state);
      if (previous != null) {
        throw new IllegalStateException("Duplicate ContainerHealthState value "
            + state.value + ": " + previous + " and " + state);
      }
    }
  }

  /**
   * Constructor for all states.
   *
   * @param value the short value identifying this state
   * @param description human-readable description of the state
   * @param metricName name under which the state is reported as a metric
   */
  ContainerHealthState(short value, String description, String metricName) {
    this.value = value;
    this.description = description;
    this.metricName = metricName;
  }

  /**
   * Get the short value of this health state.
   *
   * @return the value passed to the constant's constructor
   */
  public short getValue() {
    return value;
  }

  /**
   * Get the human-readable description of this health state.
   *
   * @return Description string
   */
  public String getDescription() {
    return description;
  }

  /**
   * Get the metric name for this health state.
   * Used for metrics reporting in ReplicationManager.
   *
   * @return Metric name string
   */
  public String getMetricName() {
    return metricName;
  }

  // ========== Factory Methods ==========

  /**
   * Create health state from short value.
   * If value matches a named constant, returns that constant.
   * Otherwise, returns {@link #HEALTHY} for unknown values, so a caller
   * cannot tell an unrecognized value apart from a genuine HEALTHY (0).
   *
   * @param value The short value
   * @return ContainerHealthState enum constant, never {@code null}
   */
  public static ContainerHealthState fromValue(short value) {
    ContainerHealthState state = VALUE_MAP.get(value);
    return state != null ? state : HEALTHY;
  }
}

hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerInfo.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ public final class ContainerInfo implements Comparable<ContainerInfo> {
8585
// The sequenceId of a close container cannot change, and all the
8686
// container replica should have the same sequenceId.
8787
private long sequenceId;
88+
// Health state of the container (determined by ReplicationManager)
89+
private ContainerHealthState healthState;
8890

8991
private ContainerInfo(Builder b) {
9092
containerID = ContainerID.valueOf(b.containerID);
@@ -99,6 +101,7 @@ private ContainerInfo(Builder b) {
99101
sequenceId = b.sequenceId;
100102
replicationConfig = b.replicationConfig;
101103
clock = b.clock;
104+
healthState = b.healthState != null ? b.healthState : ContainerHealthState.HEALTHY;
102105
}
103106

104107
public static Codec<ContainerInfo> getCodec() {
@@ -242,6 +245,24 @@ public void updateLastUsedTime() {
242245
lastUsed = clock.instant();
243246
}
244247

248+
/**
249+
* Get the health state of the container.
250+
*
251+
* @return ContainerHealthState
252+
*/
253+
public ContainerHealthState getHealthState() {
254+
return healthState;
255+
}
256+
257+
/**
258+
* Set the health state of the container.
259+
*
260+
* @param newHealthState The new health state
261+
*/
262+
public void setHealthState(ContainerHealthState newHealthState) {
263+
this.healthState = newHealthState;
264+
}
265+
245266
@JsonIgnore
246267
public HddsProtos.ContainerInfoProto getProtobuf() {
247268
HddsProtos.ContainerInfoProto.Builder builder =
@@ -269,6 +290,7 @@ public HddsProtos.ContainerInfoProto getProtobuf() {
269290
if (getPipelineID() != null) {
270291
builder.setPipelineID(getPipelineID().getProtobuf());
271292
}
293+
272294
return builder.build();
273295
}
274296

@@ -370,6 +392,7 @@ public static class Builder {
370392
private long sequenceId;
371393
private PipelineID pipelineID;
372394
private ReplicationConfig replicationConfig;
395+
private ContainerHealthState healthState;
373396

374397
public Builder setPipelineID(PipelineID pipelineId) {
375398
this.pipelineID = pipelineId;
@@ -422,6 +445,11 @@ public Builder setSequenceId(long sequenceID) {
422445
return this;
423446
}
424447

448+
public Builder setHealthState(ContainerHealthState healthState) {
449+
this.healthState = healthState;
450+
return this;
451+
}
452+
425453
/**
426454
* Also resets {@code stateEnterTime}, so make sure to set clock first.
427455
*/

0 commit comments

Comments
 (0)