Skip to content

Commit fe6d58a

Browse files
committed
Fix OTLP integration test reliability
Replace fire-and-forget session clear with a retry helper that waits for the test-agent HTTP endpoint to be ready. Replace single-GET data fetch with a polling helper (WaitForTestAgentData) that retries for up to 30 seconds, accounting for export flush timing after process exit. Fix incorrect env var OTEL_LOG_EXPORT_INTERVAL (does not exist) to OTEL_BLRP_SCHEDULE_DELAY. Set to 500ms so the OTel SDK gets multiple periodic exports before LoggerProviderSdk.Dispose() hits its 5s shutdown timeout. This is especially important for gRPC, where the first export warms the HTTP/2 connection. Set OTEL_METRIC_EXPORT_INTERVAL to 60000ms so only the shutdown flush fires, preventing duplicate metric batches from observable instruments that broke snapshot comparison. Remove [Flaky] attributes from SubmitsOtlpMetrics and SubmitsOtlpLogs.
1 parent 9c1bec7 commit fe6d58a

1 file changed

Lines changed: 72 additions & 30 deletions

File tree

tracer/test/Datadog.Trace.ClrProfiler.IntegrationTests/OpenTelemetrySdkTests.cs

Lines changed: 72 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
using System;
77
using System.Collections.Generic;
88
using System.Linq;
9+
using System.Net.Http;
910
using System.Text.RegularExpressions;
1011
using System.Threading.Tasks;
1112
using Datadog.Trace.Configuration;
@@ -254,10 +255,7 @@ public async Task SubmitsOtlpTraces(string packageVersion, string datadogTracesE
254255
var testAgentHost = Environment.GetEnvironmentVariable("TEST_AGENT_HOST") ?? "localhost";
255256
var otlpPort = protocol == "grpc" ? 4317 : 4318;
256257

257-
using (var httpClient = new System.Net.Http.HttpClient())
258-
{
259-
await httpClient.GetAsync($"http://{testAgentHost}:4318/test/session/clear");
260-
}
258+
await ClearTestAgentSession(testAgentHost);
261259

262260
// This is the key configuration that is set differently from previous test cases:
263261
// OTEL_TRACES_EXPORTER=otlp enables the DD SDK to emit traces (and trace stats) via OTLP
@@ -497,7 +495,6 @@ await Verifier.Verify(finalJson, settings)
497495

498496
#if NET6_0_OR_GREATER
499497
[SkippableTheory]
500-
[Flaky("New test agent seems to not always be ready", maxRetries: 3)]
501498
[Trait("Category", "EndToEnd")]
502499
[MemberData(nameof(GetOtlpTestData))]
503500
public async Task SubmitsOtlpMetrics(string packageVersion, string datadogMetricsEnabled, string otelMetricsEnabled, string protocol, bool useAgentHostBackup)
@@ -518,18 +515,16 @@ public async Task SubmitsOtlpMetrics(string packageVersion, string datadogMetric
518515
var testAgentHost = Environment.GetEnvironmentVariable("TEST_AGENT_HOST") ?? "localhost";
519516
var otlpPort = protocol == "grpc" ? 4317 : 4318;
520517

521-
using (var httpClient = new System.Net.Http.HttpClient())
522-
{
523-
await httpClient.GetAsync($"http://{testAgentHost}:4318/test/session/clear");
524-
}
518+
await ClearTestAgentSession(testAgentHost);
525519

526520
SetEnvironmentVariable("DD_ENV", string.Empty);
527521
SetEnvironmentVariable("DD_SERVICE", string.Empty);
528522
SetEnvironmentVariable("DD_METRICS_OTEL_METER_NAMES", "OpenTelemetryMetricsMeter");
529523
SetEnvironmentVariable("DD_METRICS_OTEL_ENABLED", datadogMetricsEnabled);
530524
SetEnvironmentVariable("OTEL_METRICS_EXPORTER_ENABLED", otelMetricsEnabled);
531525
SetEnvironmentVariable("OTEL_EXPORTER_OTLP_PROTOCOL", protocol);
532-
SetEnvironmentVariable("OTEL_METRIC_EXPORT_INTERVAL", "1000");
526+
// 60s so only the shutdown flush fires; periodic exports of observable instruments produce duplicate batches that break snapshot comparison
527+
SetEnvironmentVariable("OTEL_METRIC_EXPORT_INTERVAL", "60000");
533528

534529
if (useAgentHostBackup)
535530
{
@@ -546,13 +541,7 @@ public async Task SubmitsOtlpMetrics(string packageVersion, string datadogMetric
546541
using var agent = EnvironmentHelper.GetMockAgent();
547542
using (await RunSampleAndWaitForExit(agent, packageVersion: packageVersion ?? "1.13.1"))
548543
{
549-
using var httpClient = new System.Net.Http.HttpClient();
550-
var metricsResponse = await httpClient.GetAsync($"http://{testAgentHost}:4318/test/session/metrics");
551-
metricsResponse.EnsureSuccessStatusCode();
552-
553-
var metricsJson = await metricsResponse.Content.ReadAsStringAsync();
554-
var metricsData = JToken.Parse(metricsJson);
555-
544+
var metricsData = await WaitForTestAgentData($"http://{testAgentHost}:4318/test/session/metrics");
556545
metricsData.Should().NotBeNullOrEmpty();
557546

558547
foreach (var attribute in metricsData.SelectTokens("$..resource.attributes[?(@.key == 'telemetry.sdk.version')]"))
@@ -594,7 +583,6 @@ await Verifier.Verify(formattedJson, settings)
594583

595584
#if NETCOREAPP3_1_OR_GREATER
596585
[SkippableTheory]
597-
[Flaky("New test agent seems to not always be ready", maxRetries: 3)]
598586
[Trait("Category", "EndToEnd")]
599587
[MemberData(nameof(GetOtlpTestData))]
600588
public async Task SubmitsOtlpLogs(string packageVersion, string datadogLogsEnabled, string otelLogsEnabled, string protocol, bool useAgentHostBackup)
@@ -613,18 +601,17 @@ public async Task SubmitsOtlpLogs(string packageVersion, string datadogLogsEnabl
613601
var testAgentHost = Environment.GetEnvironmentVariable("TEST_AGENT_HOST") ?? "localhost";
614602
var otlpPort = protocol == "grpc" ? 4317 : 4318;
615603

616-
using (var httpClient = new System.Net.Http.HttpClient())
617-
{
618-
await httpClient.GetAsync($"http://{testAgentHost}:4318/test/session/clear");
619-
}
604+
await ClearTestAgentSession(testAgentHost);
620605

621606
SetEnvironmentVariable("DD_ENV", "testing");
622607
SetEnvironmentVariable("DD_SERVICE", "OtlpLogsService");
623608
SetEnvironmentVariable("OTEL_RESOURCE_ATTRIBUTES", "service.name=OtlpLogsService,deployment.environment=testing");
624609
SetEnvironmentVariable("DD_LOGS_OTEL_ENABLED", datadogLogsEnabled);
625610
SetEnvironmentVariable("OTEL_LOGS_EXPORTER_ENABLED", otelLogsEnabled);
626611
SetEnvironmentVariable("OTEL_EXPORTER_OTLP_PROTOCOL", protocol);
627-
SetEnvironmentVariable("OTEL_LOG_EXPORT_INTERVAL", "1000");
612+
// Short delay gives the OTel SDK multiple periodic exports before LoggerProviderSdk.Dispose() hits its 5s shutdown timeout.
613+
// This is especially important for gRPC, where the first export warms the HTTP/2 connection.
614+
SetEnvironmentVariable("OTEL_BLRP_SCHEDULE_DELAY", "500");
628615
SetEnvironmentVariable("DD_LOGS_DIRECT_SUBMISSION_MINIMUM_LEVEL", "Verbose");
629616

630617
if (useAgentHostBackup)
@@ -643,13 +630,7 @@ public async Task SubmitsOtlpLogs(string packageVersion, string datadogLogsEnabl
643630
{
644631
var endTimeNanoseconds = DateTimeOffset.UtcNow.ToUnixTimeNanoseconds();
645632

646-
using var httpClient = new System.Net.Http.HttpClient();
647-
var logsResponse = await httpClient.GetAsync($"http://{testAgentHost}:4318/test/session/logs");
648-
logsResponse.EnsureSuccessStatusCode();
649-
650-
var logsJson = await logsResponse.Content.ReadAsStringAsync();
651-
var logsData = JToken.Parse(logsJson);
652-
633+
var logsData = await WaitForTestAgentData($"http://{testAgentHost}:4318/test/session/logs");
653634
logsData.Should().NotBeNullOrEmpty();
654635
logsData.SelectTokens("$..log_records[*]").Should().AllSatisfy(logRecord =>
655636
{
@@ -704,6 +685,67 @@ await Verifier.Verify(formattedJson, settings)
704685
}
705686
#endif
706687

688+
/// <summary>
/// Clears the test-agent session by issuing a GET to <c>/test/session/clear</c> on port 4318,
/// retrying when the request fails (for example because the agent is not accepting connections yet).
/// </summary>
/// <param name="testAgentHost">Host name or address of the test-agent.</param>
/// <param name="maxRetries">Total number of attempts to make. Values below 1 still result in a single attempt.</param>
/// <param name="delayMs">Delay, in milliseconds, between a failed attempt and the next one.</param>
/// <returns>A task that completes once the session has been cleared successfully.</returns>
/// <remarks>
/// The exception raised by the last attempt is propagated to the caller unchanged.
/// </remarks>
private static async Task ClearTestAgentSession(string testAgentHost, int maxRetries = 5, int delayMs = 1000)
{
    using var httpClient = new HttpClient { Timeout = TimeSpan.FromSeconds(5) };
    var url = $"http://{testAgentHost}:4318/test/session/clear";

    // Always make at least one attempt, even if the caller passes zero or a negative value.
    var totalAttempts = Math.Max(1, maxRetries);

    for (var attempt = 1; ; attempt++)
    {
        try
        {
            using var response = await httpClient.GetAsync(url);
            response.EnsureSuccessStatusCode();
            return;
        }
        catch (Exception) when (attempt < totalAttempts)
        {
            // Deliberately broad best-effort retry: any failure before the last attempt is retried
            // after a pause. On the last attempt the filter is false, so the exception escapes.
            await Task.Delay(delayMs);
        }
    }
}
715+
716+
/// <summary>
/// Polls the test-agent until it returns a non-empty JSON payload or the timeout elapses.
/// The sample app exports data during shutdown, so there can be a brief delay
/// between process exit and data appearing in the test-agent.
/// </summary>
/// <param name="url">Test-agent endpoint to query with HTTP GET.</param>
/// <param name="timeoutSeconds">Maximum time, in seconds, to keep polling before making one last attempt.</param>
/// <param name="pollIntervalMs">Delay, in milliseconds, between two consecutive polls.</param>
/// <returns>
/// The first payload for which <c>HasValues</c> is true. If none arrives within the timeout, the payload
/// of one final request is returned as-is so the caller's assertion can report the actual value.
/// </returns>
/// <remarks>
/// Failed requests (<see cref="HttpRequestException"/>) and request timeouts
/// (<see cref="TaskCanceledException"/>) are tolerated while polling; the final request lets them propagate.
/// </remarks>
private static async Task<JToken> WaitForTestAgentData(string url, int timeoutSeconds = 30, int pollIntervalMs = 500)
{
    using var httpClient = new HttpClient { Timeout = TimeSpan.FromSeconds(10) };
    var timeout = TimeSpan.FromSeconds(timeoutSeconds);

    // Stopwatch is monotonic, so the deadline is unaffected by wall-clock adjustments during the test run.
    var elapsed = System.Diagnostics.Stopwatch.StartNew();

    while (elapsed.Elapsed < timeout)
    {
        try
        {
            var data = await FetchTestAgentJson(httpClient, url);
            if (data.HasValues)
            {
                return data;
            }
        }
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            // Transient failure: keep polling. If the problem persists,
            // the final attempt below surfaces the error to the caller.
        }

        await Task.Delay(pollIntervalMs);
    }

    // Final attempt -- return whatever we get so the caller's assertion shows the actual value
    return await FetchTestAgentJson(httpClient, url);
}

/// <summary>
/// Issues a single GET against the test-agent and parses the response body as JSON.
/// Throws if the request fails, the status code is not successful, or the body is not valid JSON.
/// </summary>
private static async Task<JToken> FetchTestAgentJson(HttpClient httpClient, string url)
{
    using var response = await httpClient.GetAsync(url);
    response.EnsureSuccessStatusCode();

    var json = await response.Content.ReadAsStringAsync();
    return JToken.Parse(json);
}
748+
707749
private static string GetSuffix(string packageVersion)
708750
{
709751
// The snapshots are only different in .NET Core 2.1 - .NET 5 with package version 1.0.1

0 commit comments

Comments
 (0)