Skip to content

Commit 2e7209c

Browse files
ericstjCopilot
andauthored
Fix process crash when testing Stderr (#1449)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 70442e9 commit 2e7209c

File tree

13 files changed

+254
-148
lines changed

13 files changed

+254
-148
lines changed

src/ModelContextProtocol.Core/Client/StdioClientSessionTransport.cs

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,17 @@ internal sealed class StdioClientSessionTransport : StreamClientSessionTransport
1010
private readonly StdioClientTransportOptions _options;
1111
private readonly Process _process;
1212
private readonly Queue<string> _stderrRollingLog;
13+
private readonly DataReceivedEventHandler _errorHandler;
1314
private int _cleanedUp = 0;
1415
private readonly int? _processId;
1516

16-
public StdioClientSessionTransport(StdioClientTransportOptions options, Process process, string endpointName, Queue<string> stderrRollingLog, ILoggerFactory? loggerFactory) :
17+
public StdioClientSessionTransport(StdioClientTransportOptions options, Process process, string endpointName, Queue<string> stderrRollingLog, DataReceivedEventHandler errorHandler, ILoggerFactory? loggerFactory) :
1718
base(process.StandardInput.BaseStream, process.StandardOutput.BaseStream, encoding: null, endpointName, loggerFactory)
1819
{
1920
_options = options;
2021
_process = process;
2122
_stderrRollingLog = stderrRollingLog;
23+
_errorHandler = errorHandler;
2224
try { _processId = process.Id; } catch { }
2325
}
2426

@@ -33,7 +35,7 @@ public override async Task SendMessageAsync(JsonRpcMessage message, Cancellation
3335
{
3436
// We failed to send due to an I/O error. If the server process has exited, which is then very likely the cause
3537
// for the I/O error, we should throw an exception for that instead.
36-
if (await GetUnexpectedExitExceptionAsync(cancellationToken).ConfigureAwait(false) is Exception processExitException)
38+
if (await GetUnexpectedExitExceptionAsync().ConfigureAwait(false) is Exception processExitException)
3739
{
3840
throw processExitException;
3941
}
@@ -45,15 +47,32 @@ public override async Task SendMessageAsync(JsonRpcMessage message, Cancellation
4547
/// <inheritdoc/>
4648
protected override async ValueTask CleanupAsync(Exception? error = null, CancellationToken cancellationToken = default)
4749
{
48-
// Only clean up once.
50+
// Only run the full stdio cleanup once (handler detach, process kill, etc.).
51+
// If another call is already handling cleanup, cancel the shutdown token
52+
// to unblock it (e.g. if it's stuck in WaitForExitAsync) and let it
53+
// call SetDisconnected with full StdioClientCompletionDetails.
4954
if (Interlocked.Exchange(ref _cleanedUp, 1) != 0)
5055
{
56+
CancelShutdown();
5157
return;
5258
}
5359

5460
// We've not yet forcefully terminated the server. If it's already shut down, something went wrong,
5561
// so create an exception with details about that.
56-
error ??= await GetUnexpectedExitExceptionAsync(cancellationToken).ConfigureAwait(false);
62+
error ??= await GetUnexpectedExitExceptionAsync().ConfigureAwait(false);
63+
64+
// Ensure all pending ErrorDataReceived events are drained before detaching
65+
// the handler. GetUnexpectedExitExceptionAsync does this when HasExited is
66+
// true, but there is a narrow window on Linux where the process has closed
67+
// stdout (causing EOF in ReadMessagesAsync) yet hasn't been fully reaped,
68+
// so HasExited returns false and the drain is skipped. An unconditional
69+
// wait here covers that gap. When the drain already happened above, the
70+
// call returns immediately.
71+
await WaitForProcessExitAsync().ConfigureAwait(false);
72+
73+
// Detach the stderr handler so no further ErrorDataReceived events
74+
// are dispatched during or after process disposal.
75+
_process.ErrorDataReceived -= _errorHandler;
5776

5877
// Terminate the server process (or confirm it already exited), then build
5978
// and publish strongly-typed completion details while the process handle
@@ -78,26 +97,16 @@ protected override async ValueTask CleanupAsync(Exception? error = null, Cancell
7897
await base.CleanupAsync(error, cancellationToken).ConfigureAwait(false);
7998
}
8099

81-
private async ValueTask<Exception?> GetUnexpectedExitExceptionAsync(CancellationToken cancellationToken)
100+
private async ValueTask<Exception?> GetUnexpectedExitExceptionAsync()
82101
{
83102
if (!StdioClientTransport.HasExited(_process))
84103
{
85104
return null;
86105
}
87106

88107
Debug.Assert(StdioClientTransport.HasExited(_process));
89-
try
90-
{
91-
// The process has exited, but we still need to ensure stderr has been flushed.
92-
// WaitForExitAsync only waits for exit; it does not guarantee that all
93-
// ErrorDataReceived events have been dispatched. The synchronous WaitForExit()
94-
// (no arguments) does ensure that, so call it after WaitForExitAsync completes.
95-
#if NET
96-
await _process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);
97-
#endif
98-
_process.WaitForExit();
99-
}
100-
catch { }
108+
109+
await WaitForProcessExitAsync().ConfigureAwait(false);
101110

102111
string errorMessage = "MCP server process exited unexpectedly";
103112

@@ -122,6 +131,35 @@ protected override async ValueTask CleanupAsync(Exception? error = null, Cancell
122131
return new IOException(errorMessage);
123132
}
124133

134+
/// <summary>
135+
/// Waits for the process to exit within <see cref="StdioClientTransportOptions.ShutdownTimeout"/>
136+
/// and flushes pending <see cref="Process.ErrorDataReceived"/> events.
137+
/// </summary>
138+
/// <remarks>
139+
/// On .NET, <c>Process.WaitForExitAsync</c> also waits for asynchronous output readers
140+
/// to complete, ensuring all events have been dispatched. On .NET Framework,
141+
/// <see cref="Process.WaitForExit(int)"/> does not guarantee this; the parameterless
142+
/// <see cref="Process.WaitForExit()"/> overload is needed to flush the event queue.
143+
/// This method is idempotent—calling it after the process has already been waited on
144+
/// returns immediately.
145+
/// </remarks>
146+
private async ValueTask WaitForProcessExitAsync()
147+
{
148+
try
149+
{
150+
#if NET
151+
using var timeoutCts = new CancellationTokenSource(_options.ShutdownTimeout);
152+
await _process.WaitForExitAsync(timeoutCts.Token).ConfigureAwait(false);
153+
#else
154+
if (_process.WaitForExit((int)_options.ShutdownTimeout.TotalMilliseconds))
155+
{
156+
_process.WaitForExit();
157+
}
158+
#endif
159+
}
160+
catch { }
161+
}
162+
125163
private StdioClientCompletionDetails BuildCompletionDetails(Exception? error)
126164
{
127165
StdioClientCompletionDetails details = new()

src/ModelContextProtocol.Core/Client/StdioClientTransport.cs

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ public async Task<ITransport> ConnectAsync(CancellationToken cancellationToken =
5959

6060
Process? process = null;
6161
bool processStarted = false;
62+
DataReceivedEventHandler? errorHandler = null;
6263

6364
string command = _options.Command;
6465
IList<string>? arguments = _options.Arguments;
@@ -136,7 +137,7 @@ public async Task<ITransport> ConnectAsync(CancellationToken cancellationToken =
136137
// few lines in a rolling log for use in exceptions.
137138
const int MaxStderrLength = 10; // keep the last 10 lines of stderr
138139
Queue<string> stderrRollingLog = new(MaxStderrLength);
139-
process.ErrorDataReceived += (sender, args) =>
140+
errorHandler = (sender, args) =>
140141
{
141142
string? data = args.Data;
142143
if (data is not null)
@@ -151,11 +152,22 @@ public async Task<ITransport> ConnectAsync(CancellationToken cancellationToken =
151152
stderrRollingLog.Enqueue(data);
152153
}
153154

154-
_options.StandardErrorLines?.Invoke(data);
155+
try
156+
{
157+
_options.StandardErrorLines?.Invoke(data);
158+
}
159+
catch (Exception ex)
160+
{
161+
// Prevent exceptions in the user callback from propagating
162+
// to the background thread that dispatches ErrorDataReceived,
163+
// which would crash the process.
164+
LogStderrCallbackFailed(logger, endpointName, ex);
165+
}
155166

156167
LogReadStderr(logger, endpointName, data);
157168
}
158169
};
170+
process.ErrorDataReceived += errorHandler;
159171

160172
// We need both stdin and stdout to use a no-BOM UTF-8 encoding. On .NET Core,
161173
// we can use ProcessStartInfo.StandardOutputEncoding/StandardInputEncoding, but
@@ -193,14 +205,19 @@ public async Task<ITransport> ConnectAsync(CancellationToken cancellationToken =
193205

194206
process.BeginErrorReadLine();
195207

196-
return new StdioClientSessionTransport(_options, process, endpointName, stderrRollingLog, _loggerFactory);
208+
return new StdioClientSessionTransport(_options, process, endpointName, stderrRollingLog, errorHandler, _loggerFactory);
197209
}
198210
catch (Exception ex)
199211
{
200212
LogTransportConnectFailed(logger, endpointName, ex);
201213

202214
try
203215
{
216+
if (process is not null && errorHandler is not null)
217+
{
218+
process.ErrorDataReceived -= errorHandler;
219+
}
220+
204221
DisposeProcess(process, processStarted, _options.ShutdownTimeout);
205222
}
206223
catch (Exception ex2)
@@ -228,18 +245,6 @@ internal static void DisposeProcess(
228245
process.KillTree(shutdownTimeout);
229246
}
230247

231-
// Ensure all redirected stderr/stdout events have been dispatched
232-
// before disposing. Only the no-arg WaitForExit() guarantees this;
233-
// WaitForExit(int) (as used by KillTree) does not.
234-
// This should not hang: either the process already exited on its own
235-
// (no child processes holding handles), or KillTree killed the entire
236-
// process tree. If it does take too long, the test infrastructure's
237-
// own timeout will catch it.
238-
if (!processRunning && HasExited(process))
239-
{
240-
process.WaitForExit();
241-
}
242-
243248
// Invoke the callback while the process handle is still valid,
244249
// e.g. to read ExitCode before Dispose() invalidates it.
245250
beforeDispose?.Invoke();
@@ -299,6 +304,9 @@ private static string EscapeArgumentString(string argument) =>
299304
[LoggerMessage(Level = LogLevel.Information, Message = "{EndpointName} received stderr log: '{Data}'.")]
300305
private static partial void LogReadStderr(ILogger logger, string endpointName, string data);
301306

307+
[LoggerMessage(Level = LogLevel.Warning, Message = "{EndpointName} StandardErrorLines callback failed.")]
308+
private static partial void LogStderrCallbackFailed(ILogger logger, string endpointName, Exception exception);
309+
302310
[LoggerMessage(Level = LogLevel.Information, Message = "{EndpointName} started server process with PID {ProcessId}.")]
303311
private static partial void LogTransportProcessStarted(ILogger logger, string endpointName, int processId);
304312

src/ModelContextProtocol.Core/Client/StreamClientSessionTransport.cs

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ internal class StreamClientSessionTransport : TransportBase
1515
private readonly TextReader _serverOutput;
1616
private readonly Stream _serverInputStream;
1717
private readonly SemaphoreSlim _sendLock = new(1, 1);
18-
private CancellationTokenSource? _shutdownCts = new();
18+
private readonly CancellationTokenSource _shutdownCts = new();
1919
private Task? _readTask;
2020

2121
/// <summary>
@@ -97,8 +97,16 @@ public override async Task SendMessageAsync(JsonRpcMessage message, Cancellation
9797
}
9898

9999
/// <inheritdoc/>
100-
public override ValueTask DisposeAsync() =>
101-
CleanupAsync(cancellationToken: CancellationToken.None);
100+
public override async ValueTask DisposeAsync()
101+
{
102+
await CleanupAsync(cancellationToken: CancellationToken.None).ConfigureAwait(false);
103+
104+
// Ensure the channel is always completed after disposal, even if CleanupAsync
105+
// returned early because another caller (e.g. ReadMessagesAsync) was already
106+
// running cleanup. SetDisconnected is idempotent—if the channel was already
107+
// completed by the other cleanup path, this is a no-op.
108+
SetDisconnected();
109+
}
102110

103111
private async Task ReadMessagesAsync(CancellationToken cancellationToken)
104112
{
@@ -168,15 +176,20 @@ private async Task ProcessMessageAsync(string line, CancellationToken cancellati
168176
}
169177
}
170178

179+
/// <summary>
180+
/// Cancels the shutdown token to signal that the transport is shutting down,
181+
/// without performing any other cleanup.
182+
/// </summary>
183+
protected void CancelShutdown()
184+
{
185+
_shutdownCts.Cancel();
186+
}
187+
171188
protected virtual async ValueTask CleanupAsync(Exception? error = null, CancellationToken cancellationToken = default)
172189
{
173190
LogTransportShuttingDown(Name);
174191

175-
if (Interlocked.Exchange(ref _shutdownCts, null) is { } shutdownCts)
176-
{
177-
await shutdownCts.CancelAsync().ConfigureAwait(false);
178-
shutdownCts.Dispose();
179-
}
192+
await _shutdownCts.CancelAsync().ConfigureAwait(false);
180193

181194
if (Interlocked.Exchange(ref _readTask, null) is Task readTask)
182195
{

tests/ModelContextProtocol.AspNetCore.Tests/ClientConformanceTests.cs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,24 +82,29 @@ public async Task RunConformanceTest(string scenario)
8282

8383
var process = new Process { StartInfo = startInfo };
8484

85-
process.OutputDataReceived += (sender, e) =>
85+
// Protect callbacks with try/catch to prevent ITestOutputHelper from
86+
// throwing on a background thread if events arrive after the test completes.
87+
DataReceivedEventHandler outputHandler = (sender, e) =>
8688
{
8789
if (e.Data != null)
8890
{
89-
_output.WriteLine(e.Data);
91+
try { _output.WriteLine(e.Data); } catch { }
9092
outputBuilder.AppendLine(e.Data);
9193
}
9294
};
9395

94-
process.ErrorDataReceived += (sender, e) =>
96+
DataReceivedEventHandler errorHandler = (sender, e) =>
9597
{
9698
if (e.Data != null)
9799
{
98-
_output.WriteLine(e.Data);
100+
try { _output.WriteLine(e.Data); } catch { }
99101
errorBuilder.AppendLine(e.Data);
100102
}
101103
};
102104

105+
process.OutputDataReceived += outputHandler;
106+
process.ErrorDataReceived += errorHandler;
107+
103108
process.Start();
104109
process.BeginOutputReadLine();
105110
process.BeginErrorReadLine();
@@ -112,13 +117,18 @@ public async Task RunConformanceTest(string scenario)
112117
catch (OperationCanceledException)
113118
{
114119
process.Kill(entireProcessTree: true);
120+
process.OutputDataReceived -= outputHandler;
121+
process.ErrorDataReceived -= errorHandler;
115122
return (
116123
Success: false,
117124
Output: outputBuilder.ToString(),
118125
Error: errorBuilder.ToString() + "\nProcess timed out after 5 minutes and was killed."
119126
);
120127
}
121128

129+
process.OutputDataReceived -= outputHandler;
130+
process.ErrorDataReceived -= errorHandler;
131+
122132
var output = outputBuilder.ToString();
123133
var error = errorBuilder.ToString();
124134
var success = process.ExitCode == 0 || HasOnlyWarnings(output, error);

tests/ModelContextProtocol.AspNetCore.Tests/ServerConformanceTests.cs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,24 +136,29 @@ public async Task RunPendingConformanceTest_ServerSsePolling()
136136

137137
var process = new Process { StartInfo = startInfo };
138138

139-
process.OutputDataReceived += (sender, e) =>
139+
// Protect callbacks with try/catch to prevent ITestOutputHelper from
140+
// throwing on a background thread if events arrive after the test completes.
141+
DataReceivedEventHandler outputHandler = (sender, e) =>
140142
{
141143
if (e.Data != null)
142144
{
143-
output.WriteLine(e.Data);
145+
try { output.WriteLine(e.Data); } catch { }
144146
outputBuilder.AppendLine(e.Data);
145147
}
146148
};
147149

148-
process.ErrorDataReceived += (sender, e) =>
150+
DataReceivedEventHandler errorHandler = (sender, e) =>
149151
{
150152
if (e.Data != null)
151153
{
152-
output.WriteLine(e.Data);
154+
try { output.WriteLine(e.Data); } catch { }
153155
errorBuilder.AppendLine(e.Data);
154156
}
155157
};
156158

159+
process.OutputDataReceived += outputHandler;
160+
process.ErrorDataReceived += errorHandler;
161+
157162
process.Start();
158163
process.BeginOutputReadLine();
159164
process.BeginErrorReadLine();
@@ -166,13 +171,18 @@ public async Task RunPendingConformanceTest_ServerSsePolling()
166171
catch (OperationCanceledException)
167172
{
168173
process.Kill(entireProcessTree: true);
174+
process.OutputDataReceived -= outputHandler;
175+
process.ErrorDataReceived -= errorHandler;
169176
return (
170177
Success: false,
171178
Output: outputBuilder.ToString(),
172179
Error: errorBuilder.ToString() + "\nProcess timed out after 5 minutes and was killed."
173180
);
174181
}
175182

183+
process.OutputDataReceived -= outputHandler;
184+
process.ErrorDataReceived -= errorHandler;
185+
176186
return (
177187
Success: process.ExitCode == 0,
178188
Output: outputBuilder.ToString(),

0 commit comments

Comments
 (0)