Skip to content

Commit f7563a1

Browse files
authored
Add support for handling kernel-level race condition on process finalization that causes error when attempting to access exit code. (#606)
Fix confirmed in repro of partner team scenario.
1 parent 57fcce3 commit f7563a1

11 files changed

Lines changed: 582 additions & 14 deletions

File tree

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.1.48
1+
2.1.49

src/VirtualClient/VirtualClient.Actions/Network/NetworkingWorkload/CPS/CPSExecutor.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ await this.ProcessStartRetryPolicy.ExecuteAsync(async () =>
274274
try
275275
{
276276
this.CleanupTasks.Add(() => process.SafeKill(this.Logger));
277-
await process.StartAndWaitAsync(cancellationToken, timeout);
277+
await process.StartAndWaitAsync(cancellationToken, timeout, withExitConfirmation: true);
278278
await this.LogProcessDetailsAsync(process, relatedContext, "CPS");
279279
process.ThrowIfWorkloadFailed();
280280

src/VirtualClient/VirtualClient.Actions/Network/NetworkingWorkload/Latte/LatteClientExecutor.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ await this.ProcessStartRetryPolicy.ExecuteAsync(async () =>
6060
try
6161
{
6262
this.CleanupTasks.Add(() => process.SafeKill(this.Logger));
63-
await process.StartAndWaitAsync(cancellationToken, timeout);
63+
await process.StartAndWaitAsync(cancellationToken, timeout, withExitConfirmation: true);
6464

6565
if (!cancellationToken.IsCancellationRequested)
6666
{

src/VirtualClient/VirtualClient.Actions/Network/NetworkingWorkload/NTttcp/NTttcpExecutor.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ await this.ProcessStartRetryPolicy.ExecuteAsync(async () =>
330330
try
331331
{
332332
this.CleanupTasks.Add(() => process.SafeKill(this.Logger));
333-
await process.StartAndWaitAsync(cancellationToken, timeout);
333+
await process.StartAndWaitAsync(cancellationToken, timeout, withExitConfirmation: true);
334334

335335
if (process.IsErrored())
336336
{

src/VirtualClient/VirtualClient.Actions/Network/NetworkingWorkload/SockPerf/SockPerfClientExecutor.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ await this.ProcessStartRetryPolicy.ExecuteAsync(async () =>
6363
try
6464
{
6565
this.CleanupTasks.Add(() => process.SafeKill(this.Logger));
66-
await process.StartAndWaitAsync(cancellationToken, timeout);
66+
await process.StartAndWaitAsync(cancellationToken, timeout, withExitConfirmation: true);
6767

6868
if (!cancellationToken.IsCancellationRequested)
6969
{

src/VirtualClient/VirtualClient.Common.UnitTests/ProcessExtensionsTests.cs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ namespace VirtualClient.Common
99
using System.Linq;
1010
using System.Threading;
1111
using System.Threading.Tasks;
12+
using Moq;
1213
using NUnit.Framework;
1314

1415
[TestFixture]
@@ -41,6 +42,64 @@ public void InteractiveExtensionSetsUpTheExpectedRequirementsForInterfacingWithT
4142
Assert.IsTrue(process.RedirectStandardOutput);
4243
}
4344

45+
[Test]
46+
public void ToProcessDetailsReturnsTheExpectedProcessInformation()
47+
{
48+
this.mockProcess.StandardOutput.Clear();
49+
this.mockProcess.StandardOutput.Append(Guid.NewGuid().ToString());
50+
this.mockProcess.StandardError.Clear();
51+
this.mockProcess.StandardError.Append(Guid.NewGuid().ToString());
52+
this.mockProcess.StartTime = DateTime.UtcNow.AddMinutes(-2);
53+
this.mockProcess.ExitTime = DateTime.UtcNow;
54+
this.mockProcess.StartInfo.WorkingDirectory = Environment.CurrentDirectory;
55+
56+
ProcessDetails processDetails = this.mockProcess.ToProcessDetails("NTttcp");
57+
Assert.IsNotNull(processDetails);
58+
Assert.AreEqual(this.mockProcess.FullCommand(), processDetails.CommandLine);
59+
Assert.AreEqual(this.mockProcess.ExitCode, processDetails.ExitCode);
60+
Assert.AreEqual(this.mockProcess.ExitTime, processDetails.ExitTime);
61+
Assert.AreEqual(this.mockProcess.Id, processDetails.Id);
62+
Assert.AreEqual(this.mockProcess.StandardError.ToString(), processDetails.StandardError);
63+
Assert.AreEqual(this.mockProcess.StandardOutput.ToString(), processDetails.StandardOutput);
64+
Assert.AreEqual(this.mockProcess.StartInfo.WorkingDirectory, processDetails.WorkingDirectory);
65+
}
66+
67+
[Test]
68+
public void ToProcessDetailsHandlesRaceConditionCasesWhereTheProcessExitCodeIsUnavailable()
69+
{
70+
// There is a race condition-style flaw in the .NET implementation of the
71+
// WaitForExit() method. The race condition allows for the process to exit after
72+
// completion but for a period of time to pass before the kernel completes all finalization
73+
// and cleanup steps (e.g. setting an exit code). To help prevent downstream issues that
74+
// happen when attempting to access properties on the process during this race condition period
75+
// of time, we are adding in an extra check on the process HasExited.
76+
//
77+
// Example of error hit during race condition period of time:
78+
// Process must exit before requested information can be determined.
79+
Mock<IProcessProxy> process = new Mock<IProcessProxy>();
80+
process.Setup(
81+
"/any/command.sh",
82+
"--any-args=true",
83+
"/any/working/dir",
84+
Guid.NewGuid().ToString(),
85+
Guid.NewGuid().ToString(),
86+
1234);
87+
88+
process.Setup(p => p.ExitCode).Throws(new InvalidOperationException("Process must exit before requested information can be determined."));
89+
90+
ProcessDetails processDetails = null;
91+
Assert.DoesNotThrow(() => processDetails = process.Object.ToProcessDetails("NTttcp"));
92+
93+
Assert.IsNotNull(processDetails);
94+
Assert.AreEqual(-1, processDetails.ExitCode);
95+
Assert.AreEqual(process.Object.FullCommand(), processDetails.CommandLine);
96+
Assert.AreEqual(process.Object.ExitTime, processDetails.ExitTime);
97+
Assert.AreEqual(process.Object.Id, processDetails.Id);
98+
Assert.AreEqual(process.Object.StandardError.ToString(), processDetails.StandardError);
99+
Assert.AreEqual(process.Object.StandardOutput.ToString(), processDetails.StandardOutput);
100+
Assert.AreEqual(process.Object.StartInfo.WorkingDirectory, processDetails.WorkingDirectory);
101+
}
102+
44103
[Test]
45104
public void WaitForResponseAsyncExtensionExitsAsSoonAsTheExpectedResponseIsReceived()
46105
{

src/VirtualClient/VirtualClient.Common/ProcessExtensions.cs

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ namespace VirtualClient.Common
55
{
66
using System;
77
using System.Collections.Generic;
8+
using System.IO;
89
using System.Text.RegularExpressions;
910
using System.Threading;
1011
using System.Threading.Tasks;
@@ -36,13 +37,69 @@ public static IProcessProxy Interactive(this IProcessProxy process)
3637
/// An absolute timeout to apply for the case that the process does not finish in the amount of time expected. If the
3738
/// timeout is reached a <see cref="TimeoutException"/> exception will be thrown.
3839
/// </param>
39-
public static async Task StartAndWaitAsync(this IProcessProxy process, CancellationToken cancellationToken, TimeSpan? timeout = null)
40+
/// <param name="withExitConfirmation">True to confirm an exit code before returning. Default = false.</param>
41+
public static async Task StartAndWaitAsync(this IProcessProxy process, CancellationToken cancellationToken, TimeSpan? timeout = null, bool withExitConfirmation = false)
4042
{
4143
process.ThrowIfNull(nameof(process));
4244

4345
if (process.Start())
4446
{
4547
await process.WaitForExitAsync(cancellationToken, timeout);
48+
49+
if (withExitConfirmation)
50+
{
51+
// There is a race condition-style flaw in the .NET implementation of the
52+
// WaitForExit() method. The race condition allows for the process to exit after
53+
// completion but for a period of time to pass before the kernel completes all finalization
54+
// and cleanup steps (e.g. setting an exit code). To help prevent downstream issues that
55+
// happen when attempting to access properties on the process during this race condition period
56+
// of time, we are adding in an extra check on the process HasExited.
57+
//
58+
// Example of error hit during race condition period of time:
59+
// Process must exit before requested information can be determined.
60+
DateTime exitTime = DateTime.UtcNow.AddMinutes(2);
61+
int exitCode = -1;
62+
bool confirmed = false;
63+
64+
while (DateTime.UtcNow < exitTime)
65+
{
66+
try
67+
{
68+
// If the exit code is not available, this line will throw an exception.
69+
exitCode = process.ExitCode;
70+
confirmed = true;
71+
break;
72+
}
73+
catch
74+
{
75+
// Wait, but don't throttle the CPU.
76+
await Task.Delay(1000);
77+
}
78+
}
79+
80+
if (!confirmed)
81+
{
82+
try
83+
{
84+
string processName = null;
85+
ProcessExtensions.TryGetValue<string>(
86+
() =>
87+
{
88+
return $"{Path.GetFileName(process?.StartInfo.FileName)} {process?.StartInfo.Arguments}"?.Trim();
89+
},
90+
out processName);
91+
92+
int processId = -1;
93+
ProcessExtensions.TryGetValue<int>(() => process.Id, out processId);
94+
95+
Console.Error.WriteLine($"Process exit confirmation failed for process '{processName} (id={processId})'.");
96+
}
97+
catch
98+
{
99+
// Do not allow any exceptions to surface from here.
100+
}
101+
}
102+
}
46103
}
47104
}
48105

@@ -59,11 +116,33 @@ public static ProcessDetails ToProcessDetails(this IProcessProxy process, string
59116
{
60117
process.ThrowIfNull(nameof(process));
61118

119+
int processId = -1;
120+
int exitCode = -1;
121+
122+
try
123+
{
124+
if (ProcessExtensions.TryGetValue<int>(() => process.Id, out int id))
125+
{
126+
processId = id;
127+
}
128+
129+
if (ProcessExtensions.TryGetValue<int>(() => process.ExitCode, out int code))
130+
{
131+
exitCode = code;
132+
}
133+
}
134+
catch
135+
{
136+
// Avoid exceptions caused by kernel-layer race conditions on process finalization.
137+
// e.g.
138+
// Process must exit before requested information can be determined.
139+
}
140+
62141
return new ProcessDetails
63142
{
64-
Id = process.Id,
143+
Id = processId,
65144
CommandLine = SensitiveData.ObscureSecrets($"{process.StartInfo?.FileName} {process.StartInfo?.Arguments}".Trim()),
66-
ExitCode = process.ExitCode,
145+
ExitCode = exitCode,
67146
Results = results,
68147
StandardError = process.StandardError?.Length > 0 ? process.StandardError.ToString() : string.Empty,
69148
StandardOutput = process.StandardOutput?.Length > 0 ? process.StandardOutput.ToString() : string.Empty,
@@ -142,5 +221,30 @@ public static Task<IProcessProxy> WaitForResponseAsync(
142221

143222
return process.WaitForResponseAsync(new Regex(response, comparisonOptions), cancellationToken, timeout);
144223
}
224+
225+
/// <summary>
226+
/// Returns true/false whether the value can be derived.
227+
/// </summary>
228+
/// <typeparam name="T">The data type of the value.</typeparam>
229+
/// <param name="reader">A function/delegate used to retrieve the value.</param>
230+
/// <param name="value">The value if existing.</param>
231+
/// <returns>True if the value can be confirmed to exist and is non-null.</returns>
232+
private static bool TryGetValue<T>(Func<T> reader, out T value)
233+
where T : IConvertible
234+
{
235+
bool confirmed = false;
236+
value = default(T);
237+
238+
try
239+
{
240+
value = reader.Invoke();
241+
confirmed = true;
242+
}
243+
catch
244+
{
245+
}
246+
247+
return confirmed;
248+
}
145249
}
146250
}

src/VirtualClient/VirtualClient.Common/ProcessProxy.cs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ namespace VirtualClient.Common
88
using System.Collections.Specialized;
99
using System.Diagnostics;
1010
using System.IO;
11-
using System.Linq;
12-
using System.Text.RegularExpressions;
1311
using System.Threading;
1412
using System.Threading.Tasks;
1513
using VirtualClient.Common.Extensions;
@@ -338,13 +336,11 @@ public virtual async Task WaitForExitAsync(CancellationToken cancellationToken,
338336
{
339337
if (timeout == null)
340338
{
341-
await this.UnderlyingProcess.WaitForExitAsync(cancellationToken).ConfigureAwait(false);
342-
this.exitTime = DateTime.UtcNow;
339+
await this.UnderlyingProcess.WaitForExitAsync(cancellationToken);
343340
}
344341
else
345342
{
346-
await this.UnderlyingProcess.WaitForExitAsync(cancellationToken).WaitAsync(timeout.Value).ConfigureAwait(false);
347-
this.exitTime = DateTime.UtcNow;
343+
await this.UnderlyingProcess.WaitForExitAsync(cancellationToken).WaitAsync(timeout.Value);
348344
}
349345
}
350346
catch (OperationCanceledException)

0 commit comments

Comments
 (0)