Add missing envoy.vhost.vcluster.upstream_rq_time.99_5percentile to metadata.csv (DataDog#23770)

AAraKKe · web-flow · commit 656fac7566be · 2026-05-21T14:24:09.000Z
* Add missing envoy.vhost.vcluster.upstream_rq_time.99_5percentile to metadata.csv * Add changelog entry for DataDog#23770 * Remove changelog entry (metadata-only change) * Exercise Envoy listener immediately before E2E check scrape Add a function-scoped exercise_envoy fixture that issues HTTP requests to the listener right before each E2E test reads /stats. Without this, the time between env setup (where the conftest's requests previously lived) and the agent's check invocation can span several of Envoy's 5s flush windows, by which point the histogram interval values have been reset to nan and the parser silently drops them. Also temporarily drop the metadata entry for envoy.vhost.vcluster.upstream_rq_time.99_5percentile to confirm CI now reliably catches missing metadata. * Restore conftest warm-up requests for integration tests The integration test (test_check) relies on Envoy having processed traffic before the check runs to assert metrics like envoy.cluster.ext_authz.error.count. Keep the dd_environment warm-up requests for that and have exercise_envoy re-fire just before each E2E scrape. * Use exercise_envoy fixture for integration tests too Move the Envoy listener warm-up out of dd_environment and into the function-scoped exercise_envoy fixture so it's shared by both the integration tests (which previously relied on a side-effect inside dd_environment) and the E2E tests. Single source of truth for "make sure Envoy has traffic before this test runs." * Wait for an Envoy stats flush after exercising the listener Firing the requests immediately before the agent's scrape isn't enough — Envoy only rolls samples into the histogram interval view at each 5s flush, and the parser drops percentiles whose interval value is nan. Sleep 6s so the scrape lands after the flush that captured the samples but before the next empty flush resets them. 
* Add envoy.vhost.vcluster.upstream_rq_time.99_5percentile to metadata.csv Envoy 1.14+ emits a 99.5th percentile by default for all histograms, including vhost.vcluster.upstream_rq_time. The other upstream_rq_time families (cluster, cluster.external, etc.) already carry this entry; this one was overlooked when those were added. * Drive continuous traffic for one full Envoy flush interval The previous single burst + 6s sleep relied on Envoy's flush cycle aligning with the test's request time. While that landed in the safe window in practice, the alignment isn't designed — it depends on docker_run timing happening to be a multiple of the flush interval. Spreading requests across the window removes that dependency: the most recent completed flush always has samples, so the interval percentiles are never reset to nan. * Temporarily remove 99_5percentile metadata to validate continuous-load fixture * Derive exercise_envoy timings from a flush-interval constant * Restore envoy.vhost.vcluster.upstream_rq_time.99_5percentile in metadata.csv * Document safe-scrape budget of exercise_envoy * Move exercise_envoy to a background thread Replace the synchronous loop+sleep fixture with a threading.Thread + Event so requests keep firing through the entire test, including while the agent's check is in flight. This removes the finite "safe scrape window" the previous approach relied on — every flush window during the test, including those that close mid-scrape, now has samples. Also drop the 99_5percentile metadata entry temporarily to validate the fixture continues to reliably trigger emission on master CI. * Restore envoy.vhost.vcluster.upstream_rq_time.99_5percentile in metadata.csv
diff --git a/envoy/metadata.csv b/envoy/metadata.csv
@@ -801,6 +801,7 @@ envoy.vhost.vcluster.upstream_rq_time.75percentile,gauge,,millisecond,,[Legacy]
 envoy.vhost.vcluster.upstream_rq_time.90percentile,gauge,,millisecond,,[Legacy] Request time milliseconds 90-percentile,-1,envoy,,
 envoy.vhost.vcluster.upstream_rq_time.95percentile,gauge,,millisecond,,[Legacy] Request time milliseconds 95-percentile,-1,envoy,,
 envoy.vhost.vcluster.upstream_rq_time.99percentile,gauge,,millisecond,,[Legacy] Request time milliseconds 99-percentile,-1,envoy,,
+envoy.vhost.vcluster.upstream_rq_time.99_5percentile,gauge,,millisecond,,[Legacy] Request time milliseconds 99.5-percentile,-1,envoy,,
 envoy.vhost.vcluster.upstream_rq_time.99_9percentile,gauge,,millisecond,,[Legacy] Request time milliseconds 99.9-percentile,-1,envoy,,
 envoy.vhost.vcluster.upstream_rq_time.100percentile,gauge,,millisecond,,[Legacy] Request time milliseconds 100-percentile,-1,envoy,,
 envoy.http.dynamodb.operation.upstream_rq_time.0percentile,gauge,,millisecond,,[Legacy] Time spent on operation_name tag 0-percentile,-1,envoy,,
diff --git a/envoy/tests/conftest.py b/envoy/tests/conftest.py
@@ -3,6 +3,8 @@
 # Licensed under a 3-clause BSD style license (see LICENSE)
 import copy
 import os
+import threading
+import time
 
 import pytest
 import requests
@@ -13,6 +15,13 @@
 from .common import DEFAULT_INSTANCE, DOCKER_DIR, FIXTURE_DIR, HOST, URL
 from .legacy.common import FLAVOR, INSTANCES
 
+# Envoy's default stats_flush_interval (seconds). The exercise_envoy
+# fixture paces its requests and its warm-up wait from this value so
+# that a completed flush window with samples exists before the test
+# scrapes; if Envoy's default changes (or we ever set the interval
+# explicitly in the test bootstrap config), update this constant.
+ENVOY_STATS_FLUSH_INTERVAL = 5
+
 
 @pytest.fixture(scope='session')
 def fixture_path():
@@ -35,12 +44,42 @@ def dd_environment():
         attempts=5,
         attempts_wait=10,
     ):
-        # Exercising envoy a bit will trigger extra metrics
-        requests.get('http://{}:8000/service/1'.format(HOST))
-        requests.get('http://{}:8000/service/2'.format(HOST))
         yield instance
 
 
+@pytest.fixture
+def exercise_envoy():
+    """Drive traffic through Envoy's listener for the whole test.
+
+    A background thread keeps firing requests until teardown, so every
+    flush window -- including those that close while the agent's check
+    is in flight -- has samples. An empty flush resets the interval
+    percentiles in /stats to nan (see hist_approx_quantile in
+    libcircllhist), which the parser would then filter out.
+    """
+    stop = threading.Event()
+    request_timeout = 1  # seconds; a hung request must not outlive teardown
+
+    def fire_loop():
+        while not stop.is_set():
+            for service in (1, 2):
+                try:
+                    requests.get('http://{}:8000/service/{}'.format(HOST, service), timeout=request_timeout)
+                except requests.RequestException:
+                    pass  # best effort: a failed request is just one fewer sample
+            stop.wait(ENVOY_STATS_FLUSH_INTERVAL / 10)
+
+    thread = threading.Thread(target=fire_loop, daemon=True)
+    thread.start()
+    try:
+        # One flush interval plus a 1s margin before the test body may scrape.
+        time.sleep(ENVOY_STATS_FLUSH_INTERVAL + 1)
+        yield
+    finally:
+        stop.set()
+        thread.join(timeout=2 * request_timeout + 1)
+
+
 @pytest.fixture
 def check():
     return lambda instance: Envoy('envoy', {}, [instance])
diff --git a/envoy/tests/legacy/test_e2e.py b/envoy/tests/legacy/test_e2e.py
@@ -282,7 +282,7 @@
 ]
 
 
-def test_e2e(dd_agent_check):
+def test_e2e(dd_agent_check, exercise_envoy):
     instance = {"stats_url": "http://{}:8001/stats".format(HOST)}
     aggregator = dd_agent_check(instance, rate=True)
     for metric in METRICS:
diff --git a/envoy/tests/legacy/test_integration.py b/envoy/tests/legacy/test_integration.py
@@ -12,7 +12,11 @@
 CHECK_NAME = 'envoy'
 UNIQUE_METRICS = EXT_AUTHZ_METRICS + RBAC_METRICS
 
-pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment'), pytest.mark.flaky]
+pytestmark = [
+    pytest.mark.integration,
+    pytest.mark.usefixtures('dd_environment', 'exercise_envoy'),
+    pytest.mark.flaky,
+]
 
 
 def test_success(aggregator, check, dd_run_check):
diff --git a/envoy/tests/test_e2e.py b/envoy/tests/test_e2e.py
@@ -21,7 +21,7 @@
 
 
 @pytest.mark.e2e
-def test_e2e(dd_agent_check):
+def test_e2e(dd_agent_check, exercise_envoy):
     aggregator = dd_agent_check(DEFAULT_INSTANCE, rate=True)
 
     for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CONNECTION_LIMIT_METRICS + TLS_INSPECTOR_METRICS:
diff --git a/envoy/tests/test_integration.py b/envoy/tests/test_integration.py
@@ -22,7 +22,7 @@
 pytestmark = [
     requires_new_environment,
     pytest.mark.integration,
-    pytest.mark.usefixtures('dd_environment'),
+    pytest.mark.usefixtures('dd_environment', 'exercise_envoy'),
     pytest.mark.flaky,
 ]
 

Original file line number	Diff line number	Diff line change
`@@ -282,7 +282,7 @@`
`282`	`282`	`]`
`283`	`283`
`284`	`284`
`285`		`-def test_e2e(dd_agent_check):`
	`285`	`+def test_e2e(dd_agent_check, exercise_envoy):`
`286`	`286`	`instance = {"stats_url": "http://{}:8001/stats".format(HOST)}`
`287`	`287`	`aggregator = dd_agent_check(instance, rate=True)`
`288`	`288`	`for metric in METRICS:`
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`pytestmark = [`
`23`	`23`	`requires_new_environment,`
`24`	`24`	`pytest.mark.integration,`
`25`		`- pytest.mark.usefixtures('dd_environment'),`
	`25`	`+ pytest.mark.usefixtures('dd_environment', 'exercise_envoy'),`
`26`	`26`	`pytest.mark.flaky,`
`27`	`27`	`]`
`28`	`28`