Skip to content

Commit 4ebbcad

Browse files
reakaleekclaude
andcommitted
MCP: surface failing transactions when Elasticsearch is unhealthy
- MarkFailure now calls AddException and propagates ActivityStatusCode.Error to the root server span so APM shows the HTTP transaction as failed (not 200 OK) - SearchTools returns CallToolResult { IsError = true } on exception so the MCP client/model knows the tool actually failed - ElasticsearchHealthCheck wires the existing CanConnect ping into /health (returns 503 when ES is unreachable) without touching /alive (liveness) Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 576e12b commit 4ebbcad

7 files changed

Lines changed: 147 additions & 9 deletions

File tree

src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
<AssemblyAttribute Include="System.Runtime.CompilerServices.InternalsVisibleTo">
3030
<_Parameter1>Mcp.Remote.IntegrationTests</_Parameter1>
3131
</AssemblyAttribute>
32+
<AssemblyAttribute Include="System.Runtime.CompilerServices.InternalsVisibleTo">
33+
<_Parameter1>Mcp.Remote.Tests</_Parameter1>
34+
</AssemblyAttribute>
3235
</ItemGroup>
3336

3437
<ItemGroup>
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using Elastic.Documentation.Search.Common;
6+
using Microsoft.Extensions.Diagnostics.HealthChecks;
7+
8+
namespace Elastic.Documentation.Mcp.Remote.Health;
9+
10+
/// <summary>
11+
/// Readiness check that pings the Elasticsearch cluster.
12+
/// Registered on <c>/health</c> (no tag predicate) so that a cluster outage returns 503,
13+
/// giving infra/monitoring a direct signal. Intentionally excluded from <c>/alive</c>
14+
/// (tagged <c>"live"</c>) to prevent pod crash-loops during transient ES downtime.
15+
/// </summary>
16+
internal sealed class ElasticsearchHealthCheck(ElasticsearchClientAccessor accessor) : IHealthCheck
17+
{
18+
public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken)
19+
{
20+
try
21+
{
22+
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
23+
cts.CancelAfter(TimeSpan.FromSeconds(5));
24+
return await accessor.CanConnect(cts.Token)
25+
? HealthCheckResult.Healthy()
26+
: HealthCheckResult.Unhealthy("Elasticsearch ping failed");
27+
}
28+
catch (Exception ex)
29+
{
30+
return HealthCheckResult.Unhealthy("Elasticsearch unreachable", ex);
31+
}
32+
}
33+
}

src/api/Elastic.Documentation.Mcp.Remote/Program.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
using Elastic.Documentation.LinkIndex;
99
using Elastic.Documentation.Links.InboundLinks;
1010
using Elastic.Documentation.Mcp.Remote;
11+
using Elastic.Documentation.Mcp.Remote.Health;
1112
using Elastic.Documentation.Mcp.Remote.Telemetry;
1213
using Elastic.Documentation.Search.Common;
1314
using Elastic.Documentation.ServiceDefaults;
@@ -26,6 +27,10 @@
2627
var builder = WebApplication.CreateSlimBuilder(args);
2728
_ = builder.AddDocumentationServiceDefaults();
2829
_ = builder.AddDefaultHealthChecks();
30+
// Readiness check: pings Elasticsearch so /health returns 503 during an ES outage.
31+
// Excluded from /alive (liveness) to avoid pod crash-loops.
32+
_ = builder.Services.AddHealthChecks()
33+
.AddCheck<ElasticsearchHealthCheck>("elasticsearch", tags: ["ready"]);
2934
_ = builder.AddEuidEnrichment();
3035
_ = builder.Services.ConfigureOpenTelemetryTracerProvider(t =>
3136
t.AddSource(McpToolTelemetry.McpToolSourceName));
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using ModelContextProtocol.Protocol;
6+
7+
namespace Elastic.Documentation.Mcp.Remote.Responses;
8+
9+
/// <summary>
10+
/// Factory helpers for building <see cref="CallToolResult"/> values consistently across all tools.
11+
/// </summary>
12+
internal static class McpToolResults
13+
{
14+
/// <summary>
15+
/// Wraps a success payload (already serialized to JSON) in a <see cref="CallToolResult"/>.
16+
/// Equivalent to returning a plain <c>string</c> from a tool, but typed explicitly.
17+
/// </summary>
18+
public static CallToolResult Ok(string json) =>
19+
new() { Content = [new TextContentBlock { Text = json }] };
20+
21+
/// <summary>
22+
/// Returns a <see cref="CallToolResult"/> with <see cref="CallToolResult.IsError"/> set so
23+
/// the MCP client and model know the tool call failed. HTTP response remains 200 per
24+
/// MCP/JSON-RPC spec.
25+
/// </summary>
26+
public static CallToolResult Error(string json) =>
27+
new() { IsError = true, Content = [new TextContentBlock { Text = json }] };
28+
}

src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,25 @@ public static void MarkFailure(Activity? activity, Exception ex)
6161
_ = activity?.SetTag("mcp.call.success", false);
6262
_ = activity?.SetTag("mcp.call.error_type", ex.GetType().FullName);
6363
_ = activity?.SetTag("error.message", ex.Message);
64+
_ = activity?.AddException(ex);
6465
_ = activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
66+
FailServerSpan(ex);
67+
}
68+
69+
// Walk up to the ASP.NET Core server (transaction) span and mark it failed so that
70+
// the HTTP transaction appears as a failing transaction in APM even though the HTTP
71+
// response is 200 (correct per MCP/JSON-RPC spec). No-ops when there is no server
72+
// ancestor, e.g. in unit tests.
73+
private static void FailServerSpan(Exception ex)
74+
{
75+
for (var a = Activity.Current; a is not null; a = a.Parent)
76+
{
77+
if (a.Kind != ActivityKind.Server)
78+
continue;
79+
_ = a.SetStatus(ActivityStatusCode.Error, ex.Message);
80+
_ = a.AddException(ex);
81+
return;
82+
}
6583
}
6684

6785
public static void MarkFailure(Activity? activity, string errorType, string message)

src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
using Elastic.Documentation.Mcp.Remote.Telemetry;
1111
using Elastic.Documentation.Search;
1212
using Microsoft.Extensions.Logging;
13+
using ModelContextProtocol.Protocol;
1314
using ModelContextProtocol.Server;
1415

1516
namespace Elastic.Documentation.Mcp.Remote.Tools;
@@ -28,7 +29,7 @@ public class SearchTools(IFullSearchService fullSearchGateway, ILogger<SearchToo
2829
"Use when the user asks about Elastic product features, needs to find existing docs pages, " +
2930
"verify published content, or research what documentation exists on a topic. " +
3031
"Returns relevant documents with AI summaries, relevance scores, and navigation context.")]
31-
public async Task<string> SemanticSearch(
32+
public async Task<CallToolResult> SemanticSearch(
3233
[Description("The search query - can be a question or keywords")] string query,
3334
[Description("Page number (1-based, default: 1)")] int pageNumber = 1,
3435
[Description("Number of results per page (default: 10, max: 50)")] int pageSize = 10,
@@ -87,7 +88,7 @@ public async Task<string> SemanticSearch(
8788

8889
McpToolTelemetry.MarkSuccess(activity);
8990
outcome = "success";
90-
return JsonSerializer.Serialize(response, McpJsonContext.Default.SemanticSearchResponse);
91+
return McpToolResults.Ok(JsonSerializer.Serialize(response, McpJsonContext.Default.SemanticSearchResponse));
9192
}
9293
catch (OperationCanceledException)
9394
{
@@ -99,7 +100,7 @@ public async Task<string> SemanticSearch(
99100
{
100101
McpToolTelemetry.MarkFailure(activity, ex);
101102
logger.LogError(ex, "SemanticSearch failed for query '{Query}'", query);
102-
return JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse);
103+
return McpToolResults.Error(JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse));
103104
}
104105
finally
105106
{
@@ -115,7 +116,7 @@ public async Task<string> SemanticSearch(
115116
"Finds {docs} pages related to a given topic. " +
116117
"Use when exploring what documentation exists around a subject, building context for writing, " +
117118
"or discovering related content the user should be aware of.")]
118-
public async Task<string> FindRelatedDocs(
119+
public async Task<CallToolResult> FindRelatedDocs(
119120
[Description("Topic or search terms to find related documents for")] string topic,
120121
[Description("Maximum number of related documents to return (default: 10)")] int limit = 10,
121122
[Description("Filter by product ID (e.g., 'elasticsearch', 'kibana')")] string? productFilter = null,
@@ -165,7 +166,7 @@ public async Task<string> FindRelatedDocs(
165166

166167
McpToolTelemetry.MarkSuccess(activity);
167168
outcome = "success";
168-
return JsonSerializer.Serialize(response, McpJsonContext.Default.RelatedDocsResponse);
169+
return McpToolResults.Ok(JsonSerializer.Serialize(response, McpJsonContext.Default.RelatedDocsResponse));
169170
}
170171
catch (OperationCanceledException)
171172
{
@@ -177,7 +178,7 @@ public async Task<string> FindRelatedDocs(
177178
{
178179
McpToolTelemetry.MarkFailure(activity, ex);
179180
logger.LogError(ex, "FindRelatedDocs failed for topic '{Topic}'", topic);
180-
return JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse);
181+
return McpToolResults.Error(JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse));
181182
}
182183
finally
183184
{

tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
33
// See the LICENSE file in the project root for more information
44

5+
using System.Collections.Generic;
56
using System.Diagnostics;
67
using Elastic.Documentation.Configuration;
78
using Elastic.Documentation.Mcp.Remote;
@@ -91,6 +92,52 @@ public void MarkFailure_SetsFailureTagsAndErrorStatus()
9192
tags["error.message"].Should().Be("gateway failed");
9293
activity.Status.Should().Be(ActivityStatusCode.Error);
9394
activity.StatusDescription.Should().Be("gateway failed");
95+
activity.Events.Should().Contain(e => e.Name == "exception");
96+
}
97+
98+
[Fact]
99+
public void MarkFailure_PropagatesErrorStatusToServerAncestor()
100+
{
101+
using var serverSource = new ActivitySource("Test.McpServer");
102+
using var listener = CreateListenerForSources(McpToolTelemetry.McpToolSourceName, "Test.McpServer");
103+
104+
// Simulate the ASP.NET Core server span (the HTTP transaction in APM).
105+
using var serverActivity = serverSource.StartActivity("POST /mcp", ActivityKind.Server);
106+
serverActivity.Should().NotBeNull();
107+
108+
// Tool span is a child of the server span (matches runtime Activity parenting).
109+
using var toolActivity = McpToolTelemetry.StartActivity("test_tool");
110+
toolActivity.Should().NotBeNull();
111+
toolActivity!.Parent.Should().Be(serverActivity);
112+
113+
var ex = new InvalidOperationException("ES is down");
114+
McpToolTelemetry.MarkFailure(toolActivity, ex);
115+
116+
// Tool span carries the error and exception event.
117+
toolActivity.Status.Should().Be(ActivityStatusCode.Error);
118+
toolActivity.Events.Should().Contain(e => e.Name == "exception");
119+
120+
// Server/transaction span is also marked Error — this makes the HTTP transaction
121+
// appear as a failing transaction in APM despite the 200 response code.
122+
serverActivity!.Status.Should().Be(ActivityStatusCode.Error);
123+
serverActivity.StatusDescription.Should().Be("ES is down");
124+
serverActivity.Events.Should().Contain(e => e.Name == "exception");
125+
}
126+
127+
[Fact]
128+
public void MarkFailure_NoServerAncestor_DoesNotThrow()
129+
{
130+
// In unit tests (and any context without an active server span) FailServerSpan
131+
// should be a no-op rather than failing.
132+
using var listener = CreateListener();
133+
using var activity = McpToolTelemetry.StartActivity("test_tool");
134+
activity.Should().NotBeNull();
135+
136+
var ex = new InvalidOperationException("isolated failure");
137+
var act = () => McpToolTelemetry.MarkFailure(activity, ex);
138+
139+
act.Should().NotThrow();
140+
activity!.Status.Should().Be(ActivityStatusCode.Error);
94141
}
95142

96143
[Fact]
@@ -109,14 +156,17 @@ public void MarkCancelled_SetsCancelledTagAndErrorStatus()
109156
activity.StatusDescription.Should().Be("cancelled");
110157
}
111158

112-
private static ActivityListener CreateListener()
159+
private static ActivityListener CreateListener() =>
160+
CreateListenerForSources(McpToolTelemetry.McpToolSourceName);
161+
162+
private static ActivityListener CreateListenerForSources(params string[] sourceNames)
113163
{
164+
var nameSet = new HashSet<string>(sourceNames, StringComparer.Ordinal);
114165
var listener = new ActivityListener
115166
{
116-
ShouldListenTo = source => source.Name == "Elastic.Documentation.Api.McpTools",
167+
ShouldListenTo = source => nameSet.Contains(source.Name),
117168
Sample = static (ref ActivityCreationOptions<ActivityContext> _) => ActivitySamplingResult.AllDataAndRecorded
118169
};
119-
120170
ActivitySource.AddActivityListener(listener);
121171
return listener;
122172
}

0 commit comments

Comments
 (0)