-
Notifications
You must be signed in to change notification settings - Fork 203
Expand file tree
/
Copy pathConcurrentModificationExceptionTest.java
More file actions
430 lines (387 loc) · 19.1 KB
/
ConcurrentModificationExceptionTest.java
File metadata and controls
430 lines (387 loc) · 19.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
package org.jenkins.plugins.lockableresources;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import hudson.Functions;
import java.util.ArrayList;
import java.util.ConcurrentModificationException;
import java.util.List;
import java.util.Random;
import java.util.Timer;
import java.util.TimerTask;
import java.util.logging.Handler;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
import org.jenkins.plugins.lockableresources.util.Constants;
import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
import org.jenkinsci.plugins.workflow.job.WorkflowJob;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;
import org.junit.jupiter.api.extension.ExtendWith;
import org.jvnet.hudson.test.Issue;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.junit.jupiter.WithJenkins;
import uk.org.webcompere.systemstubs.jupiter.SystemStub;
import uk.org.webcompere.systemstubs.jupiter.SystemStubsExtension;
import uk.org.webcompere.systemstubs.stream.SystemErr;
import uk.org.webcompere.systemstubs.stream.output.MultiplexOutput;
import uk.org.webcompere.systemstubs.stream.output.Output;
import uk.org.webcompere.systemstubs.stream.output.TapStream;
@ExtendWith(SystemStubsExtension.class)
@WithJenkins
class ConcurrentModificationExceptionTest {
private static final Logger LOGGER = Logger.getLogger(ConcurrentModificationExceptionTest.class.getName());
// Prepare to capture CME clues in JVM or Jenkins instance
// logs (stderr -- sometimes the problem is reported there,
// but does not cause a crash for any of the runs). This
// apparently must be a class-wide rule (so it can tap into
// the stream before Jenkins starts). For more details,
// please see:
// * https://github.com/webcompere/system-stubs
// * https://www.baeldung.com/java-system-stubs
@SystemStub
private SystemErr systemErrTap = new SystemErr(new MultiplexOutput(new TapStream(), Output.fromStream(System.err)));
@Test
void parallelTasksTest(JenkinsRule j) throws Exception {
final int agentsCount = Functions.isWindows() ? 5 : 10;
final int extraAgentsCount = Functions.isWindows() ? 5 : 20;
final int resourcesCount = 100;
final int extraResourcesCount = 100;
// disable save. Everything is saved into filesystem, and it takes a while
// normally it is no problem, but we need to start many tasks parallel
System.setProperty(Constants.SYSTEM_PROPERTY_DISABLE_SAVE, "true");
// Do not mirror nodes now. We will allow it later in parallel tasks
System.setProperty(Constants.SYSTEM_PROPERTY_ENABLE_NODE_MIRROR, "false");
LOGGER.info("add agents");
for (int i = 1; i <= agentsCount; i++) j.createSlave("Agent_" + i, "label label2 agent", null);
LOGGER.info("add agents done");
LockableResourcesManager LRM = LockableResourcesManager.get();
LOGGER.info("add resources");
for (int i = 1; i <= resourcesCount; i++) LRM.createResourceWithLabel("resource_" + i, "label label2");
LOGGER.info("add resources done");
TimerTask taskBackwardCompatibility = new TimerTask() {
public void run() {
LOGGER.info("run BackwardCompatibility");
BackwardCompatibility.compatibilityMigration();
LOGGER.info("BackwardCompatibility done");
}
};
TimerTask taskFreeDeadJobs = new TimerTask() {
public void run() {
LOGGER.info("run FreeDeadJobs");
FreeDeadJobs.freePostMortemResources();
LOGGER.info("FreeDeadJobs done");
}
};
TimerTask taskNodesMirror = new TimerTask() {
public void run() {
System.setProperty(Constants.SYSTEM_PROPERTY_ENABLE_NODE_MIRROR, "true");
LOGGER.info("run NodesMirror");
org.jenkins.plugins.lockableresources.nodes.NodesMirror.createNodeResources();
LOGGER.info("NodesMirror done");
}
};
TimerTask taskCreateAgents = new TimerTask() {
public void run() {
LOGGER.info("create extra slaves");
for (int i = 1; i <= extraAgentsCount; i++) {
try {
j.createSlave("ExtraAgent_" + i, "label label2 extra-agent", null);
} catch (Exception error) {
LOGGER.warning(error.toString());
}
}
LOGGER.info("create extra slaves done");
}
};
TimerTask taskCreateResources = new TimerTask() {
public void run() {
LOGGER.info("create extra resources");
for (int i = 1; i <= extraResourcesCount; i++) {
try {
if (LockableResourcesManager.get()
.createResourceWithLabel("ExtraResource_" + i, "label label2"))
LOGGER.info("ExtraResource_" + i + " added");
assertNotNull(LockableResourcesManager.get().fromName("ExtraResource_" + i));
Thread.sleep(10);
} catch (Exception error) {
LOGGER.warning(error.toString());
}
}
LOGGER.info("create extra resources done");
}
};
long delay = 10L;
Timer timerCreateAgents = new Timer("CreateAgents");
timerCreateAgents.schedule(taskCreateAgents, delay);
Timer timerCreateResources = new Timer("CreateResources");
timerCreateResources.schedule(taskCreateResources, ++delay);
Timer timerBackwardCompatibility = new Timer("BackwardCompatibility");
timerBackwardCompatibility.schedule(taskBackwardCompatibility, ++delay);
Timer timerFreeDeadJobs = new Timer("FreeDeadJobs");
timerFreeDeadJobs.schedule(taskFreeDeadJobs, ++delay);
Timer timerNodesMirror = new Timer("NodesMirror");
timerNodesMirror.schedule(taskNodesMirror, ++delay);
for (int i = 1; i <= 100; i++) {
Thread.sleep(500);
LOGGER.info("wait for resources " + i + " "
+ " extra-agent: "
+ LRM.getResourcesWithLabel("extra-agent").size() + " == " + extraAgentsCount);
if (LRM.getResourcesWithLabel("extra-agent").size() == extraAgentsCount) break;
}
for (int i = 1; i <= extraAgentsCount; i++) {
try {
assertNotNull(LockableResourcesManager.get().fromName("ExtraAgent_" + i));
j.jenkins.removeNode(j.jenkins.getNode("ExtraAgent_" + i));
} catch (Exception error) {
LOGGER.warning(error.toString());
}
}
// all the tasks are asynchronous operations, so wait until resources are created.
LOGGER.info("wait for resources");
for (int i = 1; i <= 100; i++) {
Thread.sleep(500);
LOGGER.info("wait for resources " + i + " "
+ LRM.resourceExist("ExtraResource_" + extraResourcesCount)
+ " extra-agent: "
+ LRM.getResourcesWithLabel("extra-agent").size() + " == 0 "
+ " agent: " + LRM.getResourcesWithLabel("agent").size());
if (LRM.resourceExist("ExtraResource_" + extraResourcesCount)
&& LRM.getResourcesWithLabel("extra-agent").isEmpty()
&& LRM.getResourcesWithLabel("agent").size() == agentsCount) break;
}
// normally it is bad idea to make so much assertions, but we need verify if all works fine
for (int i = 1; i <= resourcesCount; i++) {
assertNotNull(LockableResourcesManager.get().fromName("resource_" + i));
}
for (int i = 1; i <= extraResourcesCount; i++) {
assertNotNull(LockableResourcesManager.get().fromName("ExtraResource_" + i));
}
for (int i = 1; i <= extraAgentsCount; i++) {
assertNull(LockableResourcesManager.get().fromName("ExtraAgent_" + i));
}
for (int i = 1; i <= agentsCount; i++) {
assertNotNull(LockableResourcesManager.get().fromName("Agent_" + i));
}
}
/**
* Various events in Jenkins can cause saving of Workflow state,
* even if particular jobs are configured for "performance" mode
* to avoid causing this themselves in transitions between steps.
* This causes an XStream export of Java objects, which may crash
* with a {@link ConcurrentModificationException} if certain
* complex properties of Jenkins Actions are being modified at
* the same time (e.g. LR log list is updated here):
* <pre>
* java.util.ConcurrentModificationException
* ...
* Caused: java.lang.RuntimeException: Failed to serialize
* org.jenkins.plugins.lockableresources.actions.LockedResourcesBuildAction#logs
* for class org.jenkins.plugins.lockableresources.actions.LockedResourcesBuildAction
* ...
* Also: org.jenkinsci.plugins.workflow.actions.ErrorAction$ErrorId: {UUID}
* Caused: java.lang.RuntimeException: Failed to serialize hudson.model.Actionable#actions
* for class org.jenkinsci.plugins.workflow.job.WorkflowRun
* ...
* </pre>
*
* This test aims to reproduce the issue, and eventually confirm
* a fix and non-regression.<br/>
*
* Alas, "reliably catching a non-deterministic race condition"
* is maybe an oxymoron in itself, so we try to do our best here.
*
* @throws Exception If test failed
*/
@Test
@Timeout(900)
@Issue("JENKINS-76294")
void noCmeWhileSavingXStreamVsLockedResourcesBuildAction(JenkinsRule j) throws Exception {
// How many parallel stages would we use before saving WorkflowRun
// state inside the pipeline run, and overall?
// The preflood also defines how many lockable resources we use
// (half persistent, half ephemeral).
int preflood = 25, maxflood = 75;
// More workers to increase the chaos in competition for resources;
// this number should exceed maxRuns (agents are dedicated to a job
// so they can all run simultaneously and not wait for executors).
int extraAgents = 16;
// How many jobs run in parallel?
// Note that along with the amount of agents and maxflood
// this dictates how long the test runs.
int maxRuns = 3;
List<WorkflowJob> wfJobs = new ArrayList<>();
List<WorkflowRun> wfRuns = new ArrayList<>();
LockableResourcesManager lrm = LockableResourcesManager.get();
// Substrings we would seek in logs to say the problem happened:
List<String> indicatorsCME = new ArrayList<>();
indicatorsCME.add("Failed to serialize");
indicatorsCME.add("java.util.ConcurrentModificationException");
// Prepare to capture CME clues in JVM or Jenkins instance logs
// (sometimes the problem is reported there, but does not cause
// a crash for any of the runs).
Logger capturingLogger = Logger.getLogger(""); // root, or specific package/logger
StringBuilder capturedLogs = new StringBuilder();
Handler capturingLogHandler = new Handler() {
@Override
public void publish(LogRecord record) {
capturedLogs
.append(record.getLevel())
.append(": ")
.append(record.getMessage())
.append('\n');
}
@Override
public void flush() {}
@Override
public void close() throws SecurityException {}
};
// Involve also the lag and race of remote executions
LOGGER.info("create extra build agents");
for (int i = 1; i <= extraAgents; i++) {
try {
// Match JOB_NAME like "test1"
j.createSlave("ExtraAgent_" + i, "worker-test" + (i % maxRuns), null);
} catch (Exception error) {
LOGGER.warning(error.toString());
}
}
LOGGER.info("create extra build agents done");
LOGGER.info("define " + (preflood / 2) + " persistent lockable resources");
for (int i = 1; i <= preflood / 2; i++) {
lrm.createResource("lock-" + i);
}
LOGGER.info("define " + maxRuns + " test workflows");
String pipeCode = "import java.lang.Math;\n" + "import java.util.Random;\n"
+
// Do not occupy all readers at first, so all our
// jobs can get defined and started simultaneously
// (avoid them "waiting for available executors")
"lock('first') { sleep 3 }\n"
+ "def parstages = [:]\n"
+
// flood with lock actions, including logging about them
"def preflood = "
+ preflood + "\n" + "def maxflood = "
+ maxflood + "\n" + "for (int i = 1; i < preflood; i++) {\n"
+
// Note that we must use toString() and explicit vars to
// avoid seeing same values at time of GString evaluation
" String iStr = String.valueOf(i)\n"
+ " parstages[\"stage-${iStr}\".toString()] = {\n"
+ " node(label: 'worker-' + env.JOB_NAME) {\n"
+ " lock(\"lock-${iStr}\".toString()) {\n"
+ " sleep 1\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ "}\n"
+
// force a save while mutations are happening
"parstages['saver'] = {\n"
+ " org.jenkinsci.plugins.workflow.job.WorkflowRun r = currentBuild.rawBuild\n"
+ " r.save()\n"
+ "}\n"
+
// sandwiching makes it more likely to get the race condition
// as someone works with locks while XStream kicks in
// Also make it actually wait for some (ephemeral) locks
"for (int i = preflood; i < maxflood; i++) {\n"
+ " String iStr = String.valueOf(i)\n"
+ " String iStrLock = String.valueOf((i % preflood) + 1)\n"
+ " String iStrName = \"Build ${JOB_NAME} #${BUILD_ID} in parstage ${iStr} for lock ${iStrLock}\".toString()\n"
+ " parstages[\"stage-${iStr}\".toString()] = {\n"
+ " node(label: 'worker-' + env.JOB_NAME) {\n"
+ " lock(\"lock-${iStrLock}\".toString()) {\n"
+
// Changes of currentBuild should cause some saves too
// (also badges, SCM steps, etc. - but these would need
// more plugins as dependencies just for the tests):
" echo \"Set currentBuild.displayName = '${iStrName}'\"\n"
+ " currentBuild.displayName = iStrName\n"
+
// Randomize which job/lock combo waits for which,
// so we do not have all builds sequentially completing:
" sleep (time: 500 + Math.abs(new Random().nextInt(1000)), unit: 'MILLISECONDS')\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "parallel parstages\n";
capturingLogger.addHandler(capturingLogHandler);
try {
for (int i = 0; i < maxRuns; i++) {
WorkflowJob p = j.createProject(WorkflowJob.class);
p.setDefinition(new CpsFlowDefinition(pipeCode, false));
wfJobs.add(p);
}
LOGGER.info("Execute test workflows");
for (int i = 0; i < maxRuns; i++) {
WorkflowRun r = wfJobs.get(i).scheduleBuild2(0).waitForStart();
wfRuns.add(r);
}
for (int i = 0; i < maxRuns; i++) {
j.waitForMessage("[Pipeline] parallel", wfRuns.get(i));
}
Jenkins jenkins = Jenkins.get();
// Trigger Jenkins-wide save activities.
// Note: job runs also save workflow for good measure
for (int i = 0; i < 10; i++) {
LOGGER.info("Trigger Jenkins/LR state save (random interval ~3s +- 50ms)");
lrm.save();
// Let the timing be out of sync of ~1s sleeps of the pipelines
Thread.sleep(2950 + Math.abs(new Random().nextInt(100)));
}
// Also save state of whole Jenkins config somehow.
// TOTHINK: Is there more to XStream-able state to save?
for (int i = 0; i < 10; i++) {
LOGGER.info("Trigger LR state save (regular interval ~2.1s)");
lrm.save();
// Let the timing be out of sync of ~1s sleeps of the pipelines
Thread.sleep(2139);
LOGGER.info("Trigger Jenkins state save (regular interval ~3s later)");
jenkins.save();
Thread.sleep(2922);
}
LOGGER.info("Wait for builds to complete");
for (int i = 0; i < maxRuns; i++) {
j.waitForCompletion(wfRuns.get(i));
}
LOGGER.info("All builds completed; will analyze their status and logs below");
} finally {
// Complete this bit of ritual even if test run
// (e.g. build status assertion) throws above
capturingLogger.removeHandler(capturingLogHandler);
}
LOGGER.info("Check statuses of completed builds");
for (int i = 0; i < maxRuns; i++) {
j.assertBuildStatusSuccess(wfRuns.get(i));
}
LOGGER.info("Check build logs that CME related messages are absent");
for (int i = 0; i < maxRuns; i++) {
WorkflowRun r = wfRuns.get(i);
for (String s : indicatorsCME) {
j.assertLogNotContains(s, r);
}
}
// Not printed if assertion above fails:
LOGGER.info("All " + maxRuns + " builds are done successfully and did not report CME");
String stderrLog = systemErrTap.getText();
if (stderrLog != null && !(stderrLog.isBlank())) {
LOGGER.info("Check JVM stderr that CME related messages are absent");
for (String s : indicatorsCME) {
assertFalse(stderrLog.contains(s));
}
} else {
LOGGER.info("We tapped into JVM stderr but this collected nothing");
}
LOGGER.info("Check custom Jenkins logger that CME related messages are absent");
String capturedLog = capturedLogs.toString();
for (String s : indicatorsCME) {
assertFalse(capturedLog.contains(s));
}
LOGGER.info("SUCCESS: Test completed without catching any indicators of ConcurrentModificationException");
}
}