diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj b/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj index f5a5b5bd5d..869bddc313 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj +++ b/src/api/Elastic.Documentation.Mcp.Remote/Elastic.Documentation.Mcp.Remote.csproj @@ -29,6 +29,9 @@ <_Parameter1>Mcp.Remote.IntegrationTests + + <_Parameter1>Mcp.Remote.Tests + diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Health/ElasticsearchHealthCheck.cs b/src/api/Elastic.Documentation.Mcp.Remote/Health/ElasticsearchHealthCheck.cs new file mode 100644 index 0000000000..9f3b708c80 --- /dev/null +++ b/src/api/Elastic.Documentation.Mcp.Remote/Health/ElasticsearchHealthCheck.cs @@ -0,0 +1,37 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using Elastic.Documentation.Search.Common; +using Microsoft.Extensions.Diagnostics.HealthChecks; + +namespace Elastic.Documentation.Mcp.Remote.Health; + +/// +/// Readiness check that pings the Elasticsearch cluster. +/// Registered on /health (no tag predicate) so that a cluster outage returns 503, +/// giving infra/monitoring a direct signal. Intentionally excluded from /alive +/// (tagged "live") to prevent pod crash-loops during transient ES downtime. +/// +internal sealed class ElasticsearchHealthCheck(ElasticsearchClientAccessor accessor) : IHealthCheck +{ + public async Task CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken) + { + try + { + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(TimeSpan.FromSeconds(5)); + return await accessor.CanConnect(cts.Token) + ? HealthCheckResult.Healthy() + : HealthCheckResult.Unhealthy("Elasticsearch ping failed"); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + return HealthCheckResult.Unhealthy("Elasticsearch unreachable", ex); + } + } +} diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs index 5b5c70e22f..cbb90ce4d9 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs @@ -8,6 +8,7 @@ using Elastic.Documentation.LinkIndex; using Elastic.Documentation.Links.InboundLinks; using Elastic.Documentation.Mcp.Remote; +using Elastic.Documentation.Mcp.Remote.Health; using Elastic.Documentation.Mcp.Remote.Telemetry; using Elastic.Documentation.Search.Common; using Elastic.Documentation.ServiceDefaults; @@ -26,6 +27,10 @@ var builder = WebApplication.CreateSlimBuilder(args); _ = builder.AddDocumentationServiceDefaults(); _ = builder.AddDefaultHealthChecks(); + // Readiness check: pings Elasticsearch so /health returns 503 during an ES outage. + // Excluded from /alive (liveness) to avoid pod crash-loops. + _ = builder.Services.AddHealthChecks() + .AddCheck("elasticsearch", tags: ["ready"]); _ = builder.AddEuidEnrichment(); _ = builder.Services.ConfigureOpenTelemetryTracerProvider(t => t.AddSource(McpToolTelemetry.McpToolSourceName)); diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Responses/McpToolResults.cs b/src/api/Elastic.Documentation.Mcp.Remote/Responses/McpToolResults.cs new file mode 100644 index 0000000000..d3b6f090d8 --- /dev/null +++ b/src/api/Elastic.Documentation.Mcp.Remote/Responses/McpToolResults.cs @@ -0,0 +1,28 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using ModelContextProtocol.Protocol; + +namespace Elastic.Documentation.Mcp.Remote.Responses; + +/// +/// Factory helpers for building values consistently across all tools. +/// +internal static class McpToolResults +{ + /// + /// Wraps a success payload (already serialized to JSON) in a . + /// Equivalent to returning a plain string from a tool, but typed explicitly. + /// + public static CallToolResult Ok(string json) => + new() { Content = [new TextContentBlock { Text = json }] }; + + /// + /// Returns a with set so + /// the MCP client and model know the tool call failed. HTTP response remains 200 per + /// MCP/JSON-RPC spec. + /// + public static CallToolResult Error(string json) => + new() { IsError = true, Content = [new TextContentBlock { Text = json }] }; +} diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs b/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs index 3aa2424060..419a9e5690 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Telemetry/McpToolTelemetry.cs @@ -61,7 +61,25 @@ public static void MarkFailure(Activity? activity, Exception ex) _ = activity?.SetTag("mcp.call.success", false); _ = activity?.SetTag("mcp.call.error_type", ex.GetType().FullName); _ = activity?.SetTag("error.message", ex.Message); + _ = activity?.AddException(ex); _ = activity?.SetStatus(ActivityStatusCode.Error, ex.Message); + FailServerSpan(ex); + } + + // Walk up to the ASP.NET Core server (transaction) span and mark it failed so that + // the HTTP transaction appears as a failing transaction in APM even though the HTTP + // response is 200 (correct per MCP/JSON-RPC spec). No-ops when there is no server + // ancestor, e.g. in unit tests. + private static void FailServerSpan(Exception ex) + { + for (var a = Activity.Current; a is not null; a = a.Parent) + { + if (a.Kind != ActivityKind.Server) + continue; + _ = a.SetStatus(ActivityStatusCode.Error, ex.Message); + _ = a.AddException(ex); + return; + } } public static void MarkFailure(Activity? activity, string errorType, string message) diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs b/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs index 021c245a20..a4022acaba 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Tools/SearchTools.cs @@ -10,6 +10,7 @@ using Elastic.Documentation.Mcp.Remote.Telemetry; using Elastic.Documentation.Search; using Microsoft.Extensions.Logging; +using ModelContextProtocol.Protocol; using ModelContextProtocol.Server; namespace Elastic.Documentation.Mcp.Remote.Tools; @@ -28,7 +29,7 @@ public class SearchTools(IFullSearchService fullSearchGateway, ILogger SemanticSearch( + public async Task SemanticSearch( [Description("The search query - can be a question or keywords")] string query, [Description("Page number (1-based, default: 1)")] int pageNumber = 1, [Description("Number of results per page (default: 10, max: 50)")] int pageSize = 10, @@ -87,7 +88,7 @@ public async Task SemanticSearch( McpToolTelemetry.MarkSuccess(activity); outcome = "success"; - return JsonSerializer.Serialize(response, McpJsonContext.Default.SemanticSearchResponse); + return McpToolResults.Ok(JsonSerializer.Serialize(response, McpJsonContext.Default.SemanticSearchResponse)); } catch (OperationCanceledException) { @@ -99,7 +100,7 @@ public async Task SemanticSearch( { McpToolTelemetry.MarkFailure(activity, ex); logger.LogError(ex, "SemanticSearch failed for query '{Query}'", query); - return JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse); + return McpToolResults.Error(JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse)); } finally { @@ -115,7 +116,7 @@ public async Task SemanticSearch( "Finds {docs} pages related to a given topic. " + "Use when exploring what documentation exists around a subject, building context for writing, " + "or discovering related content the user should be aware of.")] - public async Task FindRelatedDocs( + public async Task FindRelatedDocs( [Description("Topic or search terms to find related documents for")] string topic, [Description("Maximum number of related documents to return (default: 10)")] int limit = 10, [Description("Filter by product ID (e.g., 'elasticsearch', 'kibana')")] string? productFilter = null, @@ -165,7 +166,7 @@ public async Task FindRelatedDocs( McpToolTelemetry.MarkSuccess(activity); outcome = "success"; - return JsonSerializer.Serialize(response, McpJsonContext.Default.RelatedDocsResponse); + return McpToolResults.Ok(JsonSerializer.Serialize(response, McpJsonContext.Default.RelatedDocsResponse)); } catch (OperationCanceledException) { @@ -177,7 +178,7 @@ public async Task FindRelatedDocs( { McpToolTelemetry.MarkFailure(activity, ex); logger.LogError(ex, "FindRelatedDocs failed for topic '{Topic}'", topic); - return JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse); + return McpToolResults.Error(JsonSerializer.Serialize(new ErrorResponse(ex.Message), McpJsonContext.Default.ErrorResponse)); } finally { diff --git a/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs b/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs index 207dc6f82a..a8518ae003 100644 --- a/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs +++ b/tests-integration/Mcp.Remote.IntegrationTests/SearchToolsIntegrationTests.cs @@ -5,6 +5,7 @@ using System.Text.Json; using AwesomeAssertions; using Elastic.Documentation.Mcp.Remote.Responses; +using ModelContextProtocol.Protocol; namespace Mcp.Remote.IntegrationTests; @@ -24,11 +25,13 @@ public async Task SemanticSearch_ReturnsResults() Assert.SkipUnless(canConnect, "Elasticsearch is not connected"); // Act - var resultJson = await searchTools.SemanticSearch( + var result = await searchTools.SemanticSearch( "elasticsearch getting started", cancellationToken: TestContext.Current.CancellationToken); // Assert + result.IsError.Should().BeFalse("SemanticSearch should not return an error result"); + var resultJson = ((TextContentBlock)result.Content[0]).Text; Output.WriteLine($"Result: {resultJson}"); var response = JsonSerializer.Deserialize(resultJson, McpJsonContext.Default.SemanticSearchResponse); @@ -53,12 +56,14 @@ public async Task SemanticSearch_WithProductFilter() Assert.SkipUnless(canConnect, "Elasticsearch is not connected"); // Act - var resultJson = await searchTools.SemanticSearch( + var result = await searchTools.SemanticSearch( "getting started", productFilter: "elasticsearch", cancellationToken: TestContext.Current.CancellationToken); // Assert + result.IsError.Should().BeFalse("SemanticSearch with product filter should not return an error result"); + var resultJson = ((TextContentBlock)result.Content[0]).Text; Output.WriteLine($"Result: {resultJson}"); var response = JsonSerializer.Deserialize(resultJson, McpJsonContext.Default.SemanticSearchResponse); @@ -81,12 +86,14 @@ public async Task FindRelatedDocs_ReturnsRelated() Assert.SkipUnless(canConnect, "Elasticsearch is not connected"); // Act - var resultJson = await searchTools.FindRelatedDocs( + var result = await searchTools.FindRelatedDocs( "data streams", limit: 5, cancellationToken: TestContext.Current.CancellationToken); // Assert + result.IsError.Should().BeFalse("FindRelatedDocs should not return an error result"); + var resultJson = ((TextContentBlock)result.Content[0]).Text; Output.WriteLine($"Result: {resultJson}"); var response = JsonSerializer.Deserialize(resultJson, McpJsonContext.Default.RelatedDocsResponse); diff --git a/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs b/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs index 2655e6bb9f..d5a17f6acc 100644 --- a/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs +++ b/tests/Mcp.Remote.Tests/McpToolTelemetryTests.cs @@ -2,6 +2,7 @@ // Elasticsearch B.V licenses this file to you under the Apache 2.0 License. // See the LICENSE file in the project root for more information +using System.Collections.Generic; using System.Diagnostics; using Elastic.Documentation.Configuration; using Elastic.Documentation.Mcp.Remote; @@ -91,6 +92,52 @@ public void MarkFailure_SetsFailureTagsAndErrorStatus() tags["error.message"].Should().Be("gateway failed"); activity.Status.Should().Be(ActivityStatusCode.Error); activity.StatusDescription.Should().Be("gateway failed"); + activity.Events.Should().Contain(e => e.Name == "exception"); + } + + [Fact] + public void MarkFailure_PropagatesErrorStatusToServerAncestor() + { + using var serverSource = new ActivitySource("Test.McpServer"); + using var listener = CreateListenerForSources(McpToolTelemetry.McpToolSourceName, "Test.McpServer"); + + // Simulate the ASP.NET Core server span (the HTTP transaction in APM). + using var serverActivity = serverSource.StartActivity("POST /mcp", ActivityKind.Server); + serverActivity.Should().NotBeNull(); + + // Tool span is a child of the server span (matches runtime Activity parenting). + using var toolActivity = McpToolTelemetry.StartActivity("test_tool"); + toolActivity.Should().NotBeNull(); + toolActivity!.Parent.Should().Be(serverActivity); + + var ex = new InvalidOperationException("ES is down"); + McpToolTelemetry.MarkFailure(toolActivity, ex); + + // Tool span carries the error and exception event. + toolActivity.Status.Should().Be(ActivityStatusCode.Error); + toolActivity.Events.Should().Contain(e => e.Name == "exception"); + + // Server/transaction span is also marked Error — this makes the HTTP transaction + // appear as a failing transaction in APM despite the 200 response code. + serverActivity!.Status.Should().Be(ActivityStatusCode.Error); + serverActivity.StatusDescription.Should().Be("ES is down"); + serverActivity.Events.Should().Contain(e => e.Name == "exception"); + } + + [Fact] + public void MarkFailure_NoServerAncestor_DoesNotThrow() + { + // In unit tests (and any context without an active server span) FailServerSpan + // should be a no-op rather than failing. + using var listener = CreateListener(); + using var activity = McpToolTelemetry.StartActivity("test_tool"); + activity.Should().NotBeNull(); + + var ex = new InvalidOperationException("isolated failure"); + var act = () => McpToolTelemetry.MarkFailure(activity, ex); + + act.Should().NotThrow(); + activity!.Status.Should().Be(ActivityStatusCode.Error); } [Fact] @@ -109,14 +156,17 @@ public void MarkCancelled_SetsCancelledTagAndErrorStatus() activity.StatusDescription.Should().Be("cancelled"); } - private static ActivityListener CreateListener() + private static ActivityListener CreateListener() => + CreateListenerForSources(McpToolTelemetry.McpToolSourceName); + + private static ActivityListener CreateListenerForSources(params string[] sourceNames) { + var nameSet = new HashSet(sourceNames, StringComparer.Ordinal); var listener = new ActivityListener { - ShouldListenTo = source => source.Name == "Elastic.Documentation.Api.McpTools", + ShouldListenTo = source => nameSet.Contains(source.Name), Sample = static (ref ActivityCreationOptions _) => ActivitySamplingResult.AllDataAndRecorded }; - ActivitySource.AddActivityListener(listener); return listener; }