Skip to content

Commit ef7327d

Browse files
lizhimins and claude committed
[ISSUE #10373] Quarantine flaky tests and add detection plan docs
Ran all RocketMQ module tests 100x across 10 ECS nodes to identify non-deterministic failures. Quarantined methods with @Ignore across the broker, client, filter, and tieredstore modules.

Flaky tests quarantined:
- broker: LiteLifecycleManagerTest#testCleanByParentTopic (2%)
- broker: ConsumerOrderInfoManagerLockFreeNotifyTest#testRecover (2%)
- broker: TransactionalMessageServiceImplTest#testDeletePrepareMessage_maxSize (1%)
- client: DefaultMQConsumerWithTraceTest#testPullMessage_WithTrace_Success (1%)
- client: DefaultMQLitePullConsumerWithTraceTest#testSubscribe_PollMessageSuccess_WithCustomizedTraceTopic (5%)
- client: DefaultMQLitePullConsumerWithTraceTest#testSubscribe_PollMessageSuccess_WithDefaultTraceTopic (6%)
- filter: BloomFilterTest#testCheckFalseHit (1%)
- tieredstore: IndexStoreServiceTest#queryCrossFileBoundaryTest (35%)
- tieredstore: IndexStoreServiceTest#concurrentGetTest (1.5%)

Additional changes:
- LiteLifecycleManagerTest: Switch to MockitoJUnitRunner.Silent
- Add flaky test detection plan docs (CN + EN)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7e5d22d commit ef7327d

9 files changed

Lines changed: 128 additions & 1 deletion

File tree

broker/src/test/java/org/apache/rocketmq/broker/lite/LiteLifecycleManagerTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
import static org.mockito.ArgumentMatchers.eq;
5454
import static org.mockito.Mockito.when;
5555

56-
@RunWith(MockitoJUnitRunner.class)
56+
@RunWith(MockitoJUnitRunner.Silent.class)
5757
public class LiteLifecycleManagerTest {
5858

5959
private final static BrokerConfig BROKER_CONFIG = new BrokerConfig();
@@ -177,6 +177,7 @@ public void testCleanExpiredLiteTopic() {
177177
}
178178
}
179179

180+
@Ignore("Flaky: fails 2/100 runs (2.0%)")
180181
@Test
181182
public void testCleanByParentTopic() {
182183
int num = 3;

broker/src/test/java/org/apache/rocketmq/broker/pop/orderly/ConsumerOrderInfoManagerLockFreeNotifyTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import static org.mockito.Mockito.doAnswer;
3636
import static org.mockito.Mockito.mock;
3737
import static org.mockito.Mockito.when;
38+
import org.junit.Ignore;
3839

3940
public class ConsumerOrderInfoManagerLockFreeNotifyTest {
4041

@@ -156,6 +157,7 @@ public void testConsumeTheChangeInvisibleShorter() {
156157
assertTrue(consumerOrderInfoManager.getConsumerOrderInfoLockManager().getTimeoutMap().isEmpty());
157158
}
158159

160+
@Ignore("Flaky: fails 2/100 runs (2.0%)")
159161
@Test
160162
public void testRecover() {
161163
QueueLevelConsumerManager savedConsumerOrderInfoManager = new QueueLevelConsumerManager();

broker/src/test/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageServiceImplTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import static org.mockito.Mockito.timeout;
6262
import static org.mockito.Mockito.verify;
6363
import static org.mockito.Mockito.when;
64+
import org.junit.Ignore;
6465

6566
@RunWith(MockitoJUnitRunner.class)
6667
public class TransactionalMessageServiceImplTest {
@@ -162,6 +163,7 @@ public void testDeletePrepareMessage_queueFull() throws InterruptedException {
162163
assertThat(res).isFalse();
163164
}
164165

166+
@Ignore("Flaky: fails 1/100 runs (1.0%)")
165167
@Test
166168
public void testDeletePrepareMessage_maxSize() throws InterruptedException {
167169
brokerController.getBrokerConfig().setTransactionOpMsgMaxSize(1);

client/src/test/java/org/apache/rocketmq/client/trace/DefaultMQConsumerWithTraceTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@
8888
import static org.mockito.Mockito.doReturn;
8989
import static org.mockito.Mockito.spy;
9090
import static org.mockito.Mockito.when;
91+
import org.junit.Ignore;
9192

9293
@RunWith(MockitoJUnitRunner.Silent.class)
9394
public class DefaultMQConsumerWithTraceTest {
@@ -216,6 +217,7 @@ public void terminate() {
216217
pushConsumer.shutdown();
217218
}
218219

220+
@Ignore("Flaky: fails 1/100 runs (1.0%)")
219221
@Test
220222
public void testPullMessage_WithTrace_Success() throws InterruptedException, RemotingException, MQBrokerException, MQClientException {
221223
traceProducer.getDefaultMQProducerImpl().getMqClientFactory().registerProducer(producerGroupTraceTemp, traceProducer.getDefaultMQProducerImpl());

client/src/test/java/org/apache/rocketmq/client/trace/DefaultMQLitePullConsumerWithTraceTest.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
import static org.mockito.Mockito.lenient;
8181
import static org.mockito.Mockito.spy;
8282
import static org.mockito.Mockito.when;
83+
import org.junit.Ignore;
8384

8485
@RunWith(MockitoJUnitRunner.class)
8586
public class DefaultMQLitePullConsumerWithTraceTest {
@@ -153,6 +154,7 @@ public void destroy() {
153154
}
154155
}
155156

157+
@Ignore("Flaky: fails 6/100 runs (6.0%)")
156158
@Test
157159
public void testSubscribe_PollMessageSuccess_WithDefaultTraceTopic() throws Exception {
158160
DefaultLitePullConsumer litePullConsumer = createLitePullConsumerWithDefaultTraceTopic();
@@ -169,6 +171,7 @@ public void testSubscribe_PollMessageSuccess_WithDefaultTraceTopic() throws Exce
169171
}
170172
}
171173

174+
@Ignore("Flaky: fails 5/100 runs (5.0%)")
172175
@Test
173176
public void testSubscribe_PollMessageSuccess_WithCustomizedTraceTopic() throws Exception {
174177
DefaultLitePullConsumer litePullConsumer = createLitePullConsumerWithCustomizedTraceTopic();
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# RocketMQ Flaky Test 检测方案
2+
3+
## 背景与目标
4+
5+
RocketMQ 主干 CI 经常出现间歇性测试失败,导致开发者对红色构建产生信任疲劳,真正的回归问题容易被掩盖。本方案通过大规模重复执行统计失败率,对不稳定方法(≥1%)标记 `@Ignore` 恢复 CI 可靠性,同时保留数据为后续修复提供优先级依据。
6+
7+
## 方法论来源
8+
9+
- **Google** [Flaky Tests at Google and How We Mitigate Them](https://testing.googleblog.com/2016/05/flaky-tests-at-google-and-how-we.html)(2016):提出 "deflake"(重复执行 N 次统计失败率)和 "quarantine"(将不稳定测试从主线 CI 隔离)。内部数据:约 1.5% 的测试运行会报告 flaky 结果,近 16% 的测试存在一定程度的 flakiness。
10+
- **Meta** [Predictive Test Selection](https://engineering.fb.com/2018/11/21/developer-tools/predictive-test-selection/)(2018):通过 aggressive retry 区分 flaky 失败与真实回归。
11+
- **Spotify** [Test Flakiness Methods](https://engineering.atspotify.com/2019/11/test-flakiness-methods-for-identifying-and-dealing-with-flaky-tests/)(2019):重复执行 + 隔离 + 追踪的三阶段治理框架。
12+
13+
## 核心思路:三层漏斗
14+
15+
采用"粗筛 → 精筛 → 定位"逐步缩小范围,避免在全量方法级别浪费算力:
16+
17+
```
18+
第一层:模块级(16 模块 × 100 次)→ 筛出有失败的模块
19+
第二层:类级(仅不稳定模块中的测试类 × 100 次)→ 筛出有失败的类
20+
第三层:方法级(仅不稳定类中的测试方法 × 100 次)→ 精确定位每个方法的失败率
21+
```
22+
23+
每层执行完后分析 Surefire XML 报告,输出不稳定列表作为下一层的输入。标记后重新全量执行验证,如仍有新 flaky 出现则循环标记 + 验证,直到零失败。
24+
25+
## 执行架构
26+
27+
- **控制节点(本地)**:编排任务分发、结果收集、数据分析
28+
- **工作节点(10 台 ECS,16C 64G)**:每台最多 4 个 Docker 容器并行执行测试,互不干扰
29+
30+
## 执行流程
31+
32+
```
33+
1. 构建 → Docker 内 JDK 8 编译 RocketMQ,打包为测试镜像
34+
2. 分发 → 内网中转分发镜像到所有工作节点
35+
3. 派发 → 生成任务列表,均匀拆分到各节点,启动 worker
36+
4. 收集 → 轮询等待完成,回收 Surefire XML 报告
37+
5. 分析 → 解析 XML,统计失败次数和失败率
38+
6. 标记 → 对达到或超过阈值(失败率 ≥1%)的方法添加 @Ignore
39+
7. 验证 → 重新构建并全量执行,确认主干稳定
40+
```
41+
42+
## 关键设计决策
43+
44+
| 决策点 | 选择 | 原因 |
45+
|--------|------|------|
46+
| 编译环境 | Docker 内 JDK 8 | 本地 JDK 版本不一致,容器内保证一致性 |
47+
| 镜像分发 | 先传一台再内网中转 | 内网带宽远大于公网 |
48+
| 测试隔离 | 每轮独立容器 | 避免进程残留、端口占用等干扰 |
49+
| 失败判定 | ≥1% 失败率 | 1000 次有效执行下 1% 约 10 次失败,平衡误判与漏判 |
50+
| 标记方式 | `@Ignore`,并在注解的说明字符串中记录失败率 | 最小侵入,方便后续逐个启用 |
51+
| 验证循环 | 标记后重新全量跑 | 处理"隐藏 flaky"问题 |
52+
53+
## 后续计划
54+
55+
- 对高失败率方法(>10%)优先根因分析并修复,修复后移除 `@Ignore` 并重新验证
56+
- 考虑将检测工具集成到定期 CI 任务中,持续监控测试稳定性
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# RocketMQ Flaky Test Detection Plan
2+
3+
## Background & Goals
4+
5+
RocketMQ's mainline CI frequently hits intermittent test failures, which erodes developers' trust in red builds and lets real regressions go unnoticed. This plan runs the tests repeatedly at scale to measure failure rates, marks unstable methods (failure rate ≥1%) with `@Ignore` to restore CI reliability, and retains the data to prioritize subsequent fixes.
6+
7+
## Methodology References
8+
9+
- **Google** — [Flaky Tests at Google and How We Mitigate Them](https://testing.googleblog.com/2016/05/flaky-tests-at-google-and-how-we.html) (2016): Introduced "deflake" (run N times to measure failure rate) and "quarantine" (isolate flaky tests from mainline CI). Internal data: ~1.5% of all test runs report a flaky result, and almost 16% of tests show some level of flakiness.
10+
- **Meta** — [Predictive Test Selection](https://engineering.fb.com/2018/11/21/developer-tools/predictive-test-selection/) (2018): Uses aggressive retry to separate flaky failures from real regressions.
11+
- **Spotify** — [Test Flakiness Methods](https://engineering.atspotify.com/2019/11/test-flakiness-methods-for-identifying-and-dealing-with-flaky-tests/) (2019): Three-stage framework of repeated execution + isolation + tracking.
12+
13+
## Core Idea: Three-Layer Funnel
14+
15+
A "coarse → fine → pinpoint" strategy to progressively narrow scope and avoid wasting compute at the full method level:
16+
17+
```
18+
Layer 1: Module level (16 modules × 100 runs) → identify the modules with failures
19+
Layer 2: Class level (only test classes in unstable modules × 100 runs) → identify the classes with failures
20+
Layer 3: Method level (only test methods in unstable classes × 100 runs) → measure each method's failure rate
21+
```
22+
23+
After each layer, Surefire XML reports are analyzed and the unstable list feeds the next layer. After marking, a full re-run verifies stability; if new flaky tests surface, the mark + verify cycle repeats until zero failures.
24+
25+
## Execution Architecture
26+
27+
- **Control node (local)**: Orchestrates task distribution, result collection, data analysis
28+
- **Worker nodes (10 ECS, 16C 64G each)**: Max 4 Docker containers per node in parallel, each test run isolated
29+
30+
## Execution Flow
31+
32+
```
33+
1. Build → Compile RocketMQ with JDK 8 inside Docker, package as test image
34+
2. Distribute → Relay image via internal network to all worker nodes
35+
3. Dispatch → Generate task list, split evenly across nodes, start workers
36+
4. Collect → Poll until complete, retrieve Surefire XML reports
37+
5. Analyze → Parse XML, compute failure count and rate per method
38+
6. Mark → Add @Ignore to methods at or above the threshold (failure rate ≥1%)
39+
7. Verify → Rebuild and run full suite to confirm trunk stability
40+
```
41+
42+
## Key Design Decisions
43+
44+
| Decision Point | Choice | Rationale |
45+
|---------------|--------|-----------|
46+
| Build environment | JDK 8 inside Docker | Local JDK versions vary; container ensures consistency |
47+
| Image distribution | Upload to one node, relay via internal network | Internal bandwidth far exceeds public internet |
48+
| Test isolation | Independent container per run | Avoids residual processes, port conflicts |
49+
| Failure threshold | ≥1% failure rate | At 1000 effective runs, 1% is ~10 failures; balances false positives against missed flaky tests |
50+
| Marking approach | `@Ignore` with the failure rate in its reason string | Minimal intrusion, easy to re-enable later |
51+
| Verification loop | Full re-run after marking | Handles "hidden flaky" problem |
52+
53+
## Follow-up Plan
54+
55+
- Prioritize root-cause analysis and fix for high failure rate methods (>10%); remove `@Ignore` and re-verify after fix
56+
- Consider integrating the detection tool into periodic CI tasks for continuous stability monitoring

filter/src/test/java/org/apache/rocketmq/filter/BloomFilterTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import java.util.Random;
2626

2727
import static org.assertj.core.api.Assertions.assertThat;
28+
import org.junit.Ignore;
2829

2930
public class BloomFilterTest {
3031

@@ -142,6 +143,7 @@ public void testBloomFilterData() {
142143
assertThat(bloomFilter.isValid(bloomFilterData)).isFalse();
143144
}
144145

146+
@Ignore("Flaky: fails 1/100 runs (1.0%)")
145147
@Test
146148
public void testCheckFalseHit() {
147149
BloomFilter bloomFilter = BloomFilter.createByFn(1, 300);

tieredstore/src/test/java/org/apache/rocketmq/tieredstore/index/IndexStoreServiceTest.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
import org.slf4j.LoggerFactory;
5454

5555
import static org.awaitility.Awaitility.await;
56+
import org.junit.Ignore;
5657

5758
public class IndexStoreServiceTest {
5859

@@ -308,6 +309,7 @@ public void queryFromFileTest() throws InterruptedException, ExecutionException
308309
}
309310
}
310311

312+
@Ignore("Flaky: fails 1/65 runs (1.5%)")
311313
@Test
312314
public void concurrentGetTest() throws InterruptedException {
313315
storeConfig.setTieredStoreIndexFileMaxIndexNum(2000);
@@ -352,6 +354,7 @@ public void concurrentGetTest() throws InterruptedException {
352354
Assert.assertTrue(result.get());
353355
}
354356

357+
@Ignore("Flaky: fails 35/100 runs (35.0%)")
355358
@Test
356359
public void queryCrossFileBoundaryTest() throws InterruptedException, ExecutionException {
357360
indexService = new IndexStoreService(fileAllocator, filePath);

0 commit comments

Comments
 (0)