[ISSUE #10373] Quarantine flaky tests and add detection plan docs

lizhimins · claude · lizhimins · commit ef7327d283f4 · 2026-05-25T10:53:29.000+08:00
Ran all RocketMQ module tests 100x across 10 ECS nodes to identify non-deterministic failures. Quarantined methods with @ignore across broker, client, filter, and tieredstore modules. Flaky tests quarantined: - broker: LiteLifecycleManagerTest#testCleanByParentTopic (2%) - broker: ConsumerOrderInfoManagerLockFreeNotifyTest#testRecover (2%) - broker: TransactionalMessageServiceImplTest#testDeletePrepareMessage_maxSize (1%) - client: DefaultMQConsumerWithTraceTest#testPullMessage_WithTrace_Success (1%) - client: DefaultMQLitePullConsumerWithTraceTest#testSubscribe_PollMessageSuccess_WithCustomizedTraceTopic (5%) - client: DefaultMQLitePullConsumerWithTraceTest#testSubscribe_PollMessageSuccess_WithDefaultTraceTopic (6%) - filter: BloomFilterTest#testCheckFalseHit (1%) - tieredstore: IndexStoreServiceTest#queryCrossFileBoundaryTest (35%) - tieredstore: IndexStoreServiceTest#concurrentGetTest (1.5%) Additional changes: - LiteLifecycleManagerTest: Switch to MockitoJUnitRunner.Silent - Add flaky test detection plan docs (CN + EN) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/broker/src/test/java/org/apache/rocketmq/broker/lite/LiteLifecycleManagerTest.java b/broker/src/test/java/org/apache/rocketmq/broker/lite/LiteLifecycleManagerTest.java
@@ -53,7 +53,7 @@
 import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.when;
 
-@RunWith(MockitoJUnitRunner.class)
+@RunWith(MockitoJUnitRunner.Silent.class)
 public class LiteLifecycleManagerTest {
 
     private final static BrokerConfig BROKER_CONFIG = new BrokerConfig();
@@ -177,6 +177,7 @@ public void testCleanExpiredLiteTopic() {
         }
     }
 
+    @Ignore("Flaky: fails 2/100 runs (2.0%)")
     @Test
     public void testCleanByParentTopic() {
         int num = 3;
diff --git a/broker/src/test/java/org/apache/rocketmq/broker/pop/orderly/ConsumerOrderInfoManagerLockFreeNotifyTest.java b/broker/src/test/java/org/apache/rocketmq/broker/pop/orderly/ConsumerOrderInfoManagerLockFreeNotifyTest.java
@@ -35,6 +35,7 @@
 import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
+import org.junit.Ignore;
 
 public class ConsumerOrderInfoManagerLockFreeNotifyTest {
 
@@ -156,6 +157,7 @@ public void testConsumeTheChangeInvisibleShorter() {
         assertTrue(consumerOrderInfoManager.getConsumerOrderInfoLockManager().getTimeoutMap().isEmpty());
     }
 
+    @Ignore("Flaky: fails 2/100 runs (2.0%)")
     @Test
     public void testRecover() {
         QueueLevelConsumerManager savedConsumerOrderInfoManager = new QueueLevelConsumerManager();
diff --git a/broker/src/test/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageServiceImplTest.java b/broker/src/test/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageServiceImplTest.java
@@ -61,6 +61,7 @@
 import static org.mockito.Mockito.timeout;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
+import org.junit.Ignore;
 
 @RunWith(MockitoJUnitRunner.class)
 public class TransactionalMessageServiceImplTest {
@@ -162,6 +163,7 @@ public void testDeletePrepareMessage_queueFull() throws InterruptedException {
         assertThat(res).isFalse();
     }
 
+    @Ignore("Flaky: fails 1/100 runs (1.0%)")
     @Test
     public void testDeletePrepareMessage_maxSize() throws InterruptedException {
         brokerController.getBrokerConfig().setTransactionOpMsgMaxSize(1);
diff --git a/client/src/test/java/org/apache/rocketmq/client/trace/DefaultMQConsumerWithTraceTest.java b/client/src/test/java/org/apache/rocketmq/client/trace/DefaultMQConsumerWithTraceTest.java
@@ -88,6 +88,7 @@
 import static org.mockito.Mockito.doReturn;
 import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.when;
+import org.junit.Ignore;
 
 @RunWith(MockitoJUnitRunner.Silent.class)
 public class DefaultMQConsumerWithTraceTest {
@@ -216,6 +217,7 @@ public void terminate() {
         pushConsumer.shutdown();
     }
 
+    @Ignore("Flaky: fails 1/100 runs (1.0%)")
     @Test
     public void testPullMessage_WithTrace_Success() throws InterruptedException, RemotingException, MQBrokerException, MQClientException {
         traceProducer.getDefaultMQProducerImpl().getMqClientFactory().registerProducer(producerGroupTraceTemp, traceProducer.getDefaultMQProducerImpl());
diff --git a/client/src/test/java/org/apache/rocketmq/client/trace/DefaultMQLitePullConsumerWithTraceTest.java b/client/src/test/java/org/apache/rocketmq/client/trace/DefaultMQLitePullConsumerWithTraceTest.java
@@ -80,6 +80,7 @@
 import static org.mockito.Mockito.lenient;
 import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.when;
+import org.junit.Ignore;
 
 @RunWith(MockitoJUnitRunner.class)
 public class DefaultMQLitePullConsumerWithTraceTest {
@@ -153,6 +154,7 @@ public void destroy() {
         }
     }
 
+    @Ignore("Flaky: fails 6/100 runs (6.0%)")
     @Test
     public void testSubscribe_PollMessageSuccess_WithDefaultTraceTopic() throws Exception {
         DefaultLitePullConsumer litePullConsumer = createLitePullConsumerWithDefaultTraceTopic();
@@ -169,6 +171,7 @@ public void testSubscribe_PollMessageSuccess_WithDefaultTraceTopic() throws Exce
         }
     }
 
+    @Ignore("Flaky: fails 5/100 runs (5.0%)")
     @Test
     public void testSubscribe_PollMessageSuccess_WithCustomizedTraceTopic() throws Exception {
         DefaultLitePullConsumer litePullConsumer = createLitePullConsumerWithCustomizedTraceTopic();
diff --git a/docs/cn/flaky-test-detector-plan-cn.md b/docs/cn/flaky-test-detector-plan-cn.md
@@ -0,0 +1,56 @@
+# RocketMQ Flaky Test 检测方案
+
+## 背景与目标
+
+RocketMQ 主干 CI 经常出现间歇性测试失败，导致开发者对红色构建产生信任疲劳，真正的回归问题容易被掩盖。本方案通过大规模重复执行统计失败率，对不稳定方法（≥1%）标记 `@Ignore` 恢复 CI 可靠性，同时保留数据为后续修复提供优先级依据。
+
+## 方法论来源
+
+- **Google** — [Flaky Tests at Google and How We Mitigate Them](https://testing.googleblog.com/2016/05/flaky-tests-at-google-and-how-we.html)（2016）：提出 "deflake"（重复执行 N 次统计失败率）和 "quarantine"（将不稳定测试从主线 CI 隔离）。内部数据：约 1.5% 的测试存在 flakiness，16% 曾经 flaky 过。
+- **Meta** — [Predictive Test Selection](https://engineering.fb.com/2018/11/21/developer-tools/predictive-test-selection/)（2018）：通过 aggressive retry 区分 flaky 失败与真实回归。
+- **Spotify** — [Test Flakiness Methods](https://engineering.atspotify.com/2019/11/test-flakiness-methods-for-identifying-and-dealing-with-flaky-tests/)（2019）：重复执行 + 隔离 + 追踪的三阶段治理框架。
+
+## 核心思路：三层漏斗
+
+采用"粗筛 → 精筛 → 定位"逐步缩小范围，避免在全量方法级别浪费算力：
+
+```
+第一层：模块级（16 模块 × 100 次）→ 筛出有失败的模块
+第二层：类级（仅不稳定模块中的测试类 × 100 次）→ 筛出有失败的类
+第三层：方法级（仅不稳定类中的测试方法 × 100 次）→ 精确定位每个方法的失败率
+```
+
+每层执行完后分析 Surefire XML 报告，输出不稳定列表作为下一层的输入。标记后重新全量执行验证，如仍有新 flaky 出现则循环标记 + 验证，直到零失败。
+
+## 执行架构
+
+- **控制节点（本地）**：编排任务分发、结果收集、数据分析
+- **工作节点（10 台 ECS，16C 64G）**：每台最多 4 个 Docker 容器并行执行测试，互不干扰
+
+## 执行流程
+
+```
+1. 构建  → Docker 内 JDK 8 编译 RocketMQ，打包为测试镜像
+2. 分发  → 内网中转分发镜像到所有工作节点
+3. 派发  → 生成任务列表，均匀拆分到各节点，启动 worker
+4. 收集  → 轮询等待完成，回收 Surefire XML 报告
+5. 分析  → 解析 XML，统计失败次数和失败率
+6. 标记  → 对超过阈值的方法添加 @Ignore
+7. 验证  → 重新构建并全量执行，确认主干稳定
+```
+
+## 关键设计决策
+
+| 决策点 | 选择 | 原因 |
+|--------|------|------|
+| 编译环境 | Docker 内 JDK 8 | 本地 JDK 版本不一致，容器内保证一致性 |
+| 镜像分发 | 先传一台再内网中转 | 内网带宽远大于公网 |
+| 测试隔离 | 每轮独立容器 | 避免进程残留、端口占用等干扰 |
+| 失败判定 | ≥1% 失败率 | 1000 次有效执行下 1% 约 10 次失败，平衡误判与漏判 |
+| 标记方式 | `@Ignore` + 失败率注释 | 最小侵入，方便后续逐个启用 |
+| 验证循环 | 标记后重新全量跑 | 处理"隐藏 flaky"问题 |
+
+## 后续计划
+
+- 对高失败率方法（>10%）优先根因分析并修复，修复后移除 `@Ignore` 并重新验证
+- 考虑将检测工具集成到定期 CI 任务中，持续监控测试稳定性
diff --git a/docs/cn/flaky-test-detector-plan-en.md b/docs/cn/flaky-test-detector-plan-en.md
@@ -0,0 +1,56 @@
+# RocketMQ Flaky Test Detection Plan
+
+## Background & Goals
+
+RocketMQ's mainline CI frequently experiences intermittent test failures, causing developer trust fatigue toward red builds and masking real regressions. This plan uses large-scale repeated execution to statistically measure failure rates, marks unstable methods (≥1%) with `@Ignore` to restore CI reliability, and retains the data to prioritize subsequent fixes.
+
+## Methodology References
+
+- **Google** — [Flaky Tests at Google and How We Mitigate Them](https://testing.googleblog.com/2016/05/flaky-tests-at-google-and-how-we.html) (2016): Introduced "deflake" (run N times to measure failure rate) and "quarantine" (isolate flaky tests from mainline CI). Internal data: ~1.5% of tests are flaky, 16% have been flaky at some point.
+- **Meta** — [Predictive Test Selection](https://engineering.fb.com/2018/11/21/developer-tools/predictive-test-selection/) (2018): Uses aggressive retry to separate flaky failures from real regressions.
+- **Spotify** — [Test Flakiness Methods](https://engineering.atspotify.com/2019/11/test-flakiness-methods-for-identifying-and-dealing-with-flaky-tests/) (2019): Three-stage framework of repeated execution + isolation + tracking.
+
+## Core Idea: Three-Layer Funnel
+
+A "coarse → fine → pinpoint" strategy to progressively narrow scope and avoid wasting compute at the full method level:
+
+```
+Layer 1: Module level (16 modules × 100 runs) → filter out modules with failures
+Layer 2: Class level (only classes in unstable modules × 100 runs) → filter out classes with failures
+Layer 3: Method level (only methods in unstable classes × 100 runs) → precisely locate each method's failure rate
+```
+
+After each layer, Surefire XML reports are analyzed and the unstable list feeds the next layer. After marking, a full re-run verifies stability; if new flaky tests surface, the mark + verify cycle repeats until zero failures.
+
+## Execution Architecture
+
+- **Control node (local)**: Orchestrates task distribution, result collection, data analysis
+- **Worker nodes (10 ECS, 16C 64G each)**: Max 4 Docker containers per node in parallel, each test run isolated
+
+## Execution Flow
+
+```
+1. Build      → Compile RocketMQ with JDK 8 inside Docker, package as test image
+2. Distribute → Relay image via internal network to all worker nodes
+3. Dispatch   → Generate task list, split evenly across nodes, start workers
+4. Collect    → Poll until complete, retrieve Surefire XML reports
+5. Analyze    → Parse XML, compute failure count and rate per method
+6. Mark       → Add @Ignore to methods exceeding threshold
+7. Verify     → Rebuild and run full suite to confirm trunk stability
+```
+
+## Key Design Decisions
+
+| Decision Point | Choice | Rationale |
+|---------------|--------|-----------|
+| Build environment | JDK 8 inside Docker | Local JDK versions vary; container ensures consistency |
+| Image distribution | Upload to one node, relay via internal network | Internal bandwidth far exceeds public internet |
+| Test isolation | Independent container per run | Avoids residual processes, port conflicts |
+| Failure threshold | ≥1% failure rate | ~10 failures across 1000 effective runs; balances false positives vs. missed cases |
+| Marking approach | `@Ignore` + failure rate comment | Minimal intrusion, easy to re-enable later |
+| Verification loop | Full re-run after marking | Handles "hidden flaky" problem |
+
+## Follow-up Plan
+
+- Prioritize root-cause analysis and fix for high failure rate methods (>10%); remove `@Ignore` and re-verify after fix
+- Consider integrating the detection tool into periodic CI tasks for continuous stability monitoring
diff --git a/filter/src/test/java/org/apache/rocketmq/filter/BloomFilterTest.java b/filter/src/test/java/org/apache/rocketmq/filter/BloomFilterTest.java
@@ -25,6 +25,7 @@
 import java.util.Random;
 
 import static org.assertj.core.api.Assertions.assertThat;
+import org.junit.Ignore;
 
 public class BloomFilterTest {
 
@@ -142,6 +143,7 @@ public void testBloomFilterData() {
         assertThat(bloomFilter.isValid(bloomFilterData)).isFalse();
     }
 
+    @Ignore("Flaky: fails 1/100 runs (1.0%)")
     @Test
     public void testCheckFalseHit() {
         BloomFilter bloomFilter = BloomFilter.createByFn(1, 300);
diff --git a/tieredstore/src/test/java/org/apache/rocketmq/tieredstore/index/IndexStoreServiceTest.java b/tieredstore/src/test/java/org/apache/rocketmq/tieredstore/index/IndexStoreServiceTest.java
@@ -53,6 +53,7 @@
 import org.slf4j.LoggerFactory;
 
 import static org.awaitility.Awaitility.await;
+import org.junit.Ignore;
 
 public class IndexStoreServiceTest {
 
@@ -308,6 +309,7 @@ public void queryFromFileTest() throws InterruptedException, ExecutionException
         }
     }
 
+    @Ignore("Flaky: fails 1/65 runs (1.5%)")
     @Test
     public void concurrentGetTest() throws InterruptedException {
         storeConfig.setTieredStoreIndexFileMaxIndexNum(2000);
@@ -352,6 +354,7 @@ public void concurrentGetTest() throws InterruptedException {
         Assert.assertTrue(result.get());
     }
 
+    @Ignore("Flaky: fails 35/100 runs (35.0%)")
     @Test
     public void queryCrossFileBoundaryTest() throws InterruptedException, ExecutionException {
         indexService = new IndexStoreService(fileAllocator, filePath);

Original file line number	Diff line number	Diff line change
`@@ -53,7 +53,7 @@`
`53`	`53`	`import static org.mockito.ArgumentMatchers.eq;`
`54`	`54`	`import static org.mockito.Mockito.when;`
`55`	`55`
`56`		`-@RunWith(MockitoJUnitRunner.class)`
	`56`	`+@RunWith(MockitoJUnitRunner.Silent.class)`
`57`	`57`	`public class LiteLifecycleManagerTest {`
`58`	`58`
`59`	`59`	`private final static BrokerConfig BROKER_CONFIG = new BrokerConfig();`
`@@ -177,6 +177,7 @@ public void testCleanExpiredLiteTopic() {`
`177`	`177`	`}`
`178`	`178`	`}`
`179`	`179`
	`180`	`+ @Ignore("Flaky: fails 2/100 runs (2.0%)")`
`180`	`181`	`@Test`
`181`	`182`	`public void testCleanByParentTopic() {`
`182`	`183`	`int num = 3;`
Original file line number	Diff line number	Diff line change
`@@ -80,6 +80,7 @@`
`80`	`80`	`import static org.mockito.Mockito.lenient;`
`81`	`81`	`import static org.mockito.Mockito.spy;`
`82`	`82`	`import static org.mockito.Mockito.when;`
	`83`	`+import org.junit.Ignore;`
`83`	`84`
`84`	`85`	`@RunWith(MockitoJUnitRunner.class)`
`85`	`86`	`public class DefaultMQLitePullConsumerWithTraceTest {`
`@@ -153,6 +154,7 @@ public void destroy() {`
`153`	`154`	`}`
`154`	`155`	`}`
`155`	`156`
	`157`	`+ @Ignore("Flaky: fails 6/100 runs (6.0%)")`
`156`	`158`	`@Test`
`157`	`159`	`public void testSubscribe_PollMessageSuccess_WithDefaultTraceTopic() throws Exception {`
`158`	`160`	`DefaultLitePullConsumer litePullConsumer = createLitePullConsumerWithDefaultTraceTopic();`
`@@ -169,6 +171,7 @@ public void testSubscribe_PollMessageSuccess_WithDefaultTraceTopic() throws Exce`
`169`	`171`	`}`
`170`	`172`	`}`
`171`	`173`
	`174`	`+ @Ignore("Flaky: fails 5/100 runs (5.0%)")`
`172`	`175`	`@Test`
`173`	`176`	`public void testSubscribe_PollMessageSuccess_WithCustomizedTraceTopic() throws Exception {`
`174`	`177`	`DefaultLitePullConsumer litePullConsumer = createLitePullConsumerWithCustomizedTraceTopic();`
Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,7 @@`
`53`	`53`	`import org.slf4j.LoggerFactory;`
`54`	`54`
`55`	`55`	`import static org.awaitility.Awaitility.await;`
	`56`	`+import org.junit.Ignore;`
`56`	`57`
`57`	`58`	`public class IndexStoreServiceTest {`
`58`	`59`
`@@ -308,6 +309,7 @@ public void queryFromFileTest() throws InterruptedException, ExecutionException`
`308`	`309`	`}`
`309`	`310`	`}`
`310`	`311`
	`312`	`+ @Ignore("Flaky: fails 1/65 runs (1.5%)")`
`311`	`313`	`@Test`
`312`	`314`	`public void concurrentGetTest() throws InterruptedException {`
`313`	`315`	`storeConfig.setTieredStoreIndexFileMaxIndexNum(2000);`
`@@ -352,6 +354,7 @@ public void concurrentGetTest() throws InterruptedException {`
`352`	`354`	`Assert.assertTrue(result.get());`
`353`	`355`	`}`
`354`	`356`
	`357`	`+ @Ignore("Flaky: fails 35/100 runs (35.0%)")`
`355`	`358`	`@Test`
`356`	`359`	`public void queryCrossFileBoundaryTest() throws InterruptedException, ExecutionException {`
`357`	`360`	`indexService = new IndexStoreService(fileAllocator, filePath);`