diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj b/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj
index f5a5b5bd5d..869bddc313 100644
--- a/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj
+++ b/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj
@@ -29,6 +29,9 @@
<_Parameter1>Mcp.Remote.IntegrationTests
+
+ <_Parameter1>Mcp.Remote.Tests
+
diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Health/ElasticsearchHealthCheck.cs b/src/api/Elastic.Documentation.Mcp.Remote/Health/ElasticsearchHealthCheck.cs
new file mode 100644
index 0000000000..9f3b708c80
--- /dev/null
+++ b/src/api/Elastic.Documentation.Mcp.Remote/Health/ElasticsearchHealthCheck.cs
@@ -0,0 +1,37 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using Elastic.Documentation.Search.Common;
+using Microsoft.Extensions.Diagnostics.HealthChecks;
+
+namespace Elastic.Documentation.Mcp.Remote.Health;
+
+///
+/// Readiness check that pings the Elasticsearch cluster.
+/// Registered on /health (no tag predicate) so that a cluster outage returns 503,
+/// giving infra/monitoring a direct signal. Intentionally excluded from /alive
+/// (tagged "live") to prevent pod crash-loops during transient ES downtime.
+///
+internal sealed class ElasticsearchHealthCheck(ElasticsearchClientAccessor accessor) : IHealthCheck
+{
+ public async Task CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken)
+ {
+ try
+ {
+ using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+ cts.CancelAfter(TimeSpan.FromSeconds(5));
+ return await accessor.CanConnect(cts.Token)
+ ? HealthCheckResult.Healthy()
+ : HealthCheckResult.Unhealthy("Elasticsearch ping failed");
+ }
+ catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+ {
+ throw;
+ }
+ catch (Exception ex)
+ {
+ return HealthCheckResult.Unhealthy("Elasticsearch unreachable", ex);
+ }
+ }
+}
diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs
index 5b5c70e22f..cbb90ce4d9 100644
--- a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs
+++ b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs
@@ -8,6 +8,7 @@
using Elastic.Documentation.LinkIndex;
using Elastic.Documentation.Links.InboundLinks;
using Elastic.Documentation.Mcp.Remote;
+using Elastic.Documentation.Mcp.Remote.Health;
using Elastic.Documentation.Mcp.Remote.Telemetry;
using Elastic.Documentation.Search.Common;
using Elastic.Documentation.ServiceDefaults;
@@ -26,6 +27,10 @@
var builder = WebApplication.CreateSlimBuilder(args);
_ = builder.AddDocumentationServiceDefaults();
_ = builder.AddDefaultHealthChecks();
+ // Readiness check: pings Elasticsearch so /health returns 503 during an ES outage.
+ // Excluded from /alive (liveness) to avoid pod crash-loops.
+ _ = builder.Services.AddHealthChecks()
+ .AddCheck("elasticsearch", tags: ["ready"]);
_ = builder.AddEuidEnrichment();
_ = builder.Services.ConfigureOpenTelemetryTracerProvider(t =>
t.AddSource(McpToolTelemetry.McpToolSourceName));
diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Responses/McpToolResults.cs b/src/api/Elastic.Documentation.Mcp.Remote/Responses/McpToolResults.cs
new file mode 100644
index 0000000000..d3b6f090d8
--- /dev/null
+++ b/src/api/Elastic.Documentation.Mcp.Remote/Responses/McpToolResults.cs
@@ -0,0 +1,28 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using ModelContextProtocol.Protocol;
+
+namespace Elastic.Documentation.Mcp.Remote.Responses;
+
+///
+/// Factory helpers for building values consistently across all tools.
+///
+internal static class McpToolResults
+{
+ ///
+ /// Wraps a success payload (already serialized to JSON) in a .
+ /// Equivalent to returning a plain string from a tool, but typed explicitly.
+ ///
+ public static CallToolResult Ok(string json) =>
+ new() { Content = [new TextContentBlock { Text = json }] };
+
+ ///
+ /// Returns a with set so
+ /// the MCP client and model know the tool call failed. HTTP response remains 200 per
+ /// MCP/JSON-RPC spec.
+ ///
+ public static CallToolResult Error(string json) =>
+ new() { IsError = true, Content = [new TextContentBlock { Text = json }] };
+}
diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs b/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs
index 3aa2424060..419a9e5690 100644
--- a/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs
+++ b/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs
@@ -61,7 +61,25 @@ public static void MarkFailure(Activity? activity, Exception ex)
_ = activity?.SetTag("mcp.call.success", false);
_ = activity?.SetTag("mcp.call.error_type", ex.GetType().FullName);
_ = activity?.SetTag("error.message", ex.Message);
+ _ = activity?.AddException(ex);
_ = activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
+ FailServerSpan(ex);
+ }
+
+ // Walk up to the ASP.NET Core server (transaction) span and mark it failed so that
+ // the HTTP transaction appears as a failing transaction in APM even though the HTTP
+ // response is 200 (correct per MCP/JSON-RPC spec). No-ops when there is no server
+ // ancestor, e.g. in unit tests.
+ private static void FailServerSpan(Exception ex)
+ {
+ for (var a = Activity.Current; a is not null; a = a.Parent)
+ {
+ if (a.Kind != ActivityKind.Server)
+ continue;
+ _ = a.SetStatus(ActivityStatusCode.Error, ex.Message);
+ _ = a.AddException(ex);
+ return;
+ }
}
public static void MarkFailure(Activity? activity, string errorType, string message)
diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs b/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs
index 021c245a20..a4022acaba 100644
--- a/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs
+++ b/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs
@@ -10,6 +10,7 @@
using Elastic.Documentation.Mcp.Remote.Telemetry;
using Elastic.Documentation.Search;
using Microsoft.Extensions.Logging;
+using ModelContextProtocol.Protocol;
using ModelContextProtocol.Server;
namespace Elastic.Documentation.Mcp.Remote.Tools;
@@ -28,7 +29,7 @@ public class SearchTools(IFullSearchService fullSearchGateway, ILogger SemanticSearch(
+ public async Task SemanticSearch(
[Description("The search query - can be a question or keywords")] string query,
[Description("Page number (1-based, default: 1)")] int pageNumber = 1,
[Description("Number of results per page (default: 10, max: 50)")] int pageSize = 10,
@@ -87,7 +88,7 @@ public async Task SemanticSearch(
McpToolTelemetry.MarkSuccess(activity);
outcome = "success";
- return JsonSerializer.Serialize(response, McpJsonContext.Default.SemanticSearchResponse);
+ return McpToolResults.Ok(JsonSerializer.Serialize(response, McpJsonContext.Default.SemanticSearchResponse));
}
catch (OperationCanceledException)
{
@@ -99,7 +100,7 @@ public async Task SemanticSearch(
{
McpToolTelemetry.MarkFailure(activity, ex);
logger.LogError(ex, "SemanticSearch failed for query '{Query}'", query);
- return JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse);
+ return McpToolResults.Error(JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse));
}
finally
{
@@ -115,7 +116,7 @@ public async Task SemanticSearch(
"Finds {docs} pages related to a given topic. " +
"Use when exploring what documentation exists around a subject, building context for writing, " +
"or discovering related content the user should be aware of.")]
- public async Task FindRelatedDocs(
+ public async Task FindRelatedDocs(
[Description("Topic or search terms to find related documents for")] string topic,
[Description("Maximum number of related documents to return (default: 10)")] int limit = 10,
[Description("Filter by product ID (e.g., 'elasticsearch', 'kibana')")] string? productFilter = null,
@@ -165,7 +166,7 @@ public async Task FindRelatedDocs(
McpToolTelemetry.MarkSuccess(activity);
outcome = "success";
- return JsonSerializer.Serialize(response, McpJsonContext.Default.RelatedDocsResponse);
+ return McpToolResults.Ok(JsonSerializer.Serialize(response, McpJsonContext.Default.RelatedDocsResponse));
}
catch (OperationCanceledException)
{
@@ -177,7 +178,7 @@ public async Task FindRelatedDocs(
{
McpToolTelemetry.MarkFailure(activity, ex);
logger.LogError(ex, "FindRelatedDocs failed for topic '{Topic}'", topic);
- return JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse);
+ return McpToolResults.Error(JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse));
}
finally
{
diff --git a/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs b/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs
index 207dc6f82a..a8518ae003 100644
--- a/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs
+++ b/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs
@@ -5,6 +5,7 @@
using System.Text.Json;
using AwesomeAssertions;
using Elastic.Documentation.Mcp.Remote.Responses;
+using ModelContextProtocol.Protocol;
namespace Mcp.Remote.IntegrationTests;
@@ -24,11 +25,13 @@ public async Task SemanticSearch_ReturnsResults()
Assert.SkipUnless(canConnect, "Elasticsearch is not connected");
// Act
- var resultJson = await searchTools.SemanticSearch(
+ var result = await searchTools.SemanticSearch(
"elasticsearch getting started",
cancellationToken: TestContext.Current.CancellationToken);
// Assert
+ result.IsError.Should().BeFalse("SemanticSearch should not return an error result");
+ var resultJson = ((TextContentBlock)result.Content[0]).Text;
Output.WriteLine($"Result: {resultJson}");
var response = JsonSerializer.Deserialize(resultJson, McpJsonContext.Default.SemanticSearchResponse);
@@ -53,12 +56,14 @@ public async Task SemanticSearch_WithProductFilter()
Assert.SkipUnless(canConnect, "Elasticsearch is not connected");
// Act
- var resultJson = await searchTools.SemanticSearch(
+ var result = await searchTools.SemanticSearch(
"getting started",
productFilter: "elasticsearch",
cancellationToken: TestContext.Current.CancellationToken);
// Assert
+ result.IsError.Should().BeFalse("SemanticSearch with product filter should not return an error result");
+ var resultJson = ((TextContentBlock)result.Content[0]).Text;
Output.WriteLine($"Result: {resultJson}");
var response = JsonSerializer.Deserialize(resultJson, McpJsonContext.Default.SemanticSearchResponse);
@@ -81,12 +86,14 @@ public async Task FindRelatedDocs_ReturnsRelated()
Assert.SkipUnless(canConnect, "Elasticsearch is not connected");
// Act
- var resultJson = await searchTools.FindRelatedDocs(
+ var result = await searchTools.FindRelatedDocs(
"data streams",
limit: 5,
cancellationToken: TestContext.Current.CancellationToken);
// Assert
+ result.IsError.Should().BeFalse("FindRelatedDocs should not return an error result");
+ var resultJson = ((TextContentBlock)result.Content[0]).Text;
Output.WriteLine($"Result: {resultJson}");
var response = JsonSerializer.Deserialize(resultJson, McpJsonContext.Default.RelatedDocsResponse);
diff --git a/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs b/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs
index 2655e6bb9f..d5a17f6acc 100644
--- a/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs
+++ b/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs
@@ -2,6 +2,7 @@
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information
+using System.Collections.Generic;
using System.Diagnostics;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Mcp.Remote;
@@ -91,6 +92,52 @@ public void MarkFailure_SetsFailureTagsAndErrorStatus()
tags["error.message"].Should().Be("gateway failed");
activity.Status.Should().Be(ActivityStatusCode.Error);
activity.StatusDescription.Should().Be("gateway failed");
+ activity.Events.Should().Contain(e => e.Name == "exception");
+ }
+
+ [Fact]
+ public void MarkFailure_PropagatesErrorStatusToServerAncestor()
+ {
+ using var serverSource = new ActivitySource("Test.McpServer");
+ using var listener = CreateListenerForSources(McpToolTelemetry.McpToolSourceName, "Test.McpServer");
+
+ // Simulate the ASP.NET Core server span (the HTTP transaction in APM).
+ using var serverActivity = serverSource.StartActivity("POST /mcp", ActivityKind.Server);
+ serverActivity.Should().NotBeNull();
+
+ // Tool span is a child of the server span (matches runtime Activity parenting).
+ using var toolActivity = McpToolTelemetry.StartActivity("test_tool");
+ toolActivity.Should().NotBeNull();
+ toolActivity!.Parent.Should().Be(serverActivity);
+
+ var ex = new InvalidOperationException("ES is down");
+ McpToolTelemetry.MarkFailure(toolActivity, ex);
+
+ // Tool span carries the error and exception event.
+ toolActivity.Status.Should().Be(ActivityStatusCode.Error);
+ toolActivity.Events.Should().Contain(e => e.Name == "exception");
+
+ // Server/transaction span is also marked Error — this makes the HTTP transaction
+ // appear as a failing transaction in APM despite the 200 response code.
+ serverActivity!.Status.Should().Be(ActivityStatusCode.Error);
+ serverActivity.StatusDescription.Should().Be("ES is down");
+ serverActivity.Events.Should().Contain(e => e.Name == "exception");
+ }
+
+ [Fact]
+ public void MarkFailure_NoServerAncestor_DoesNotThrow()
+ {
+ // In unit tests (and any context without an active server span) FailServerSpan
+ // should be a no-op rather than failing.
+ using var listener = CreateListener();
+ using var activity = McpToolTelemetry.StartActivity("test_tool");
+ activity.Should().NotBeNull();
+
+ var ex = new InvalidOperationException("isolated failure");
+ var act = () => McpToolTelemetry.MarkFailure(activity, ex);
+
+ act.Should().NotThrow();
+ activity!.Status.Should().Be(ActivityStatusCode.Error);
}
[Fact]
@@ -109,14 +156,17 @@ public void MarkCancelled_SetsCancelledTagAndErrorStatus()
activity.StatusDescription.Should().Be("cancelled");
}
- private static ActivityListener CreateListener()
+ private static ActivityListener CreateListener() =>
+ CreateListenerForSources(McpToolTelemetry.McpToolSourceName);
+
+ private static ActivityListener CreateListenerForSources(params string[] sourceNames)
{
+ var nameSet = new HashSet(sourceNames, StringComparer.Ordinal);
var listener = new ActivityListener
{
- ShouldListenTo = source => source.Name == "Elastic.Documentation.Api.McpTools",
+ ShouldListenTo = source => nameSet.Contains(source.Name),
Sample = static (ref ActivityCreationOptions _) => ActivitySamplingResult.AllDataAndRecorded
};
-
ActivitySource.AddActivityListener(listener);
return listener;
}