Skip to content

Commit d370b51

Browse files
branch-4.1: [fix](cloud) avoid false tablet diagnosis alarms in cloud mode #60805 (#63461)
Cherry-picked from #60805 Co-authored-by: deardeng <dengxin@selectdb.com>
1 parent 3fdd7fe commit d370b51

2 files changed

Lines changed: 110 additions & 9 deletions

File tree

fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
import org.apache.doris.catalog.Tablet;
2727
import org.apache.doris.catalog.TabletInvertedIndex;
2828
import org.apache.doris.catalog.TabletMeta;
29+
import org.apache.doris.common.Config;
2930

3031
import com.google.common.collect.Lists;
3132
import org.json.simple.JSONObject;
@@ -114,6 +115,7 @@ public static List<List<String>> diagnoseTablet(long tabletId) {
114115
StringBuilder versionErr = new StringBuilder();
115116
StringBuilder statusErr = new StringBuilder();
116117
StringBuilder compactionErr = new StringBuilder();
118+
boolean isCloudMode = Config.isCloudMode();
117119
// for local mode, getCachedVisibleVersion return visibleVersion.
118120
// for cloud mode, the replica version is not updated.
119121
long visibleVersion = partition.getCachedVisibleVersion();
@@ -143,20 +145,22 @@ public static List<List<String>> diagnoseTablet(long tabletId) {
143145
+ replica.getBackendIdWithoutException() + " is not query available. ");
144146
break;
145147
}
146-
if (be.diskExceedLimit()) {
148+
if (!isCloudMode && be.diskExceedLimit()) {
147149
backendErr.append("Backend " + replica.getBackendIdWithoutException() + " has no space left. ");
148150
break;
149151
}
150152
} while (false);
151153
// version
152-
if (replica.getVersion() != visibleVersion) {
153-
versionErr.append("Replica on backend " + replica.getBackendIdWithoutException() + "'s version ("
154-
+ replica.getVersion() + ") does not equal"
155-
+ " to partition visible version (" + visibleVersion + ")");
156-
} else if (replica.getLastFailedVersion() != -1) {
157-
versionErr.append("Replica on backend "
158-
+ replica.getBackendIdWithoutException() + "'s last failed version is "
159-
+ replica.getLastFailedVersion());
154+
if (!isCloudMode) {
155+
if (replica.getVersion() != visibleVersion) {
156+
versionErr.append("Replica on backend " + replica.getBackendIdWithoutException() + "'s version ("
157+
+ replica.getVersion() + ") does not equal"
158+
+ " to partition visible version (" + visibleVersion + ")");
159+
} else if (replica.getLastFailedVersion() != -1) {
160+
versionErr.append("Replica on backend "
161+
+ replica.getBackendIdWithoutException() + "'s last failed version is "
162+
+ replica.getLastFailedVersion());
163+
}
160164
}
161165
// status
162166
if (!replica.isAlive() || replica.isUserDrop()) {

fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java

Lines changed: 97 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,10 +17,15 @@
1717

1818
package org.apache.doris.clone;
1919

20+
import org.apache.doris.catalog.Database;
2021
import org.apache.doris.catalog.DiskInfo;
2122
import org.apache.doris.catalog.Env;
2223
import org.apache.doris.catalog.LocalTabletInvertedIndex;
24+
import org.apache.doris.catalog.MaterializedIndex;
25+
import org.apache.doris.catalog.OlapTable;
26+
import org.apache.doris.catalog.Partition;
2327
import org.apache.doris.catalog.Replica;
28+
import org.apache.doris.catalog.Tablet;
2429
import org.apache.doris.catalog.TabletInvertedIndex;
2530
import org.apache.doris.common.Config;
2631
import org.apache.doris.common.ExceptionChecker;
@@ -163,6 +168,98 @@ private static void updateReplicaVersionCount() {
163168
Assert.assertTrue(result.get(11).get(1).contains("version count is too high"));
164169
}
165170

171+
private static String getDiagnosisInfo(List<List<String>> rows, String item) {
172+
for (List<String> row : rows) {
173+
if (item.equals(row.get(0))) {
174+
return row.get(1);
175+
}
176+
}
177+
return "";
178+
}
179+
180+
private static Map<String, TDisk> copyBackendDisks(Backend backend) {
181+
Map<String, TDisk> disks = Maps.newHashMap();
182+
for (DiskInfo diskInfo : backend.getDisks().values()) {
183+
TDisk tDisk = new TDisk();
184+
tDisk.setRootPath(diskInfo.getRootPath());
185+
tDisk.setDiskTotalCapacity(diskInfo.getTotalCapacityB());
186+
tDisk.setDataUsedCapacity(diskInfo.getDataUsedCapacityB());
187+
tDisk.setTrashUsedCapacity(diskInfo.getTrashUsedCapacityB());
188+
tDisk.setDiskAvailableCapacity(diskInfo.getAvailableCapacityB());
189+
tDisk.setUsed(diskInfo.getState() == DiskInfo.DiskState.ONLINE);
190+
tDisk.setPathHash(diskInfo.getPathHash());
191+
tDisk.setStorageMedium(diskInfo.getStorageMedium());
192+
disks.put(tDisk.getRootPath(), tDisk);
193+
}
194+
return disks;
195+
}
196+
197+
private static Map<String, TDisk> buildExceedLimitDisks(Backend backend) {
198+
Map<String, TDisk> disks = Maps.newHashMap();
199+
for (DiskInfo diskInfo : backend.getDisks().values()) {
200+
TDisk tDisk = new TDisk();
201+
tDisk.setRootPath(diskInfo.getRootPath());
202+
tDisk.setDiskTotalCapacity(1L);
203+
tDisk.setDataUsedCapacity(1L);
204+
tDisk.setTrashUsedCapacity(0L);
205+
tDisk.setDiskAvailableCapacity(0L);
206+
tDisk.setUsed(true);
207+
tDisk.setPathHash(diskInfo.getPathHash());
208+
tDisk.setStorageMedium(diskInfo.getStorageMedium());
209+
disks.put(tDisk.getRootPath(), tDisk);
210+
}
211+
return disks;
212+
}
213+
214+
@Test
215+
public void testDiagnoseTabletCloudModeSkipDiskAndVersionCheck() throws Exception {
216+
String tableName = "tbl_diag_cloud_" + Math.abs(random.nextInt());
217+
String createStr = "create table test." + tableName + "\n"
218+
+ "(k1 date, k2 int)\n"
219+
+ "distributed by hash(k2) buckets 1\n"
220+
+ "properties\n"
221+
+ "(\n"
222+
+ " \"replication_num\" = \"3\"\n"
223+
+ ")";
224+
ExceptionChecker.expectThrowsNoException(() -> createTable(createStr));
225+
226+
Database db = Env.getCurrentInternalCatalog().getDbNullable("test");
227+
Assert.assertNotNull(db);
228+
OlapTable table = (OlapTable) db.getTableNullable(tableName);
229+
Assert.assertNotNull(table);
230+
Partition partition = table.getAllPartitions().iterator().next();
231+
MaterializedIndex index = partition.getBaseIndex();
232+
Tablet tablet = index.getTablets().get(0);
233+
Replica replica = tablet.getReplicas().get(0);
234+
long tabletId = tablet.getId();
235+
long visibleVersion = partition.getCachedVisibleVersion();
236+
Backend backend = Env.getCurrentSystemInfo().getBackend(replica.getBackendIdWithoutException());
237+
Assert.assertNotNull(backend);
238+
239+
Map<String, TDisk> originalDisks = copyBackendDisks(backend);
240+
String originCloudUniqueId = Config.cloud_unique_id;
241+
long originalVersion = replica.getVersion();
242+
243+
try {
244+
backend.updateDisks(buildExceedLimitDisks(backend));
245+
long mismatchVersion = visibleVersion == Long.MAX_VALUE ? visibleVersion - 1 : visibleVersion + 1;
246+
replica.adminUpdateVersionInfo(mismatchVersion, null, null, System.currentTimeMillis());
247+
248+
List<List<String>> localResult = Diagnoser.diagnoseTablet(tabletId);
249+
Assert.assertTrue(getDiagnosisInfo(localResult, "ReplicaBackendStatus").contains("has no space left"));
250+
Assert.assertTrue(getDiagnosisInfo(localResult, "ReplicaVersionStatus").contains("does not equal"));
251+
252+
Config.cloud_unique_id = "diagnose-tablet-cloud-mode-ut";
253+
List<List<String>> cloudResult = Diagnoser.diagnoseTablet(tabletId);
254+
Assert.assertEquals("OK", getDiagnosisInfo(cloudResult, "ReplicaBackendStatus"));
255+
Assert.assertEquals("OK", getDiagnosisInfo(cloudResult, "ReplicaVersionStatus"));
256+
} finally {
257+
Config.cloud_unique_id = originCloudUniqueId;
258+
backend.updateDisks(originalDisks);
259+
replica.adminUpdateVersionInfo(originalVersion, null, null, System.currentTimeMillis());
260+
}
261+
}
262+
166263
@Test
167264
public void test() throws Exception {
168265
// test colocate tablet repair

0 commit comments

Comments
 (0)