From af9dc8a17a4a3761d231903f21d02a69be55f587 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 26 May 2026 16:08:52 +0000
Subject: [PATCH 01/23] Added
 RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS to control
 inclusion of high-cardinality tags from controller metrics

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 doc/source/serve/monitoring.md                |  5 ++
 python/ray/serve/_private/constants.py        |  8 +++
 python/ray/serve/_private/controller.py       | 51 +++++++++++++-----
 .../ray/serve/tests/unit/test_controller.py   | 54 ++++++++++++++++++-
 4 files changed, 104 insertions(+), 14 deletions(-)

diff --git a/doc/source/serve/monitoring.md b/doc/source/serve/monitoring.md
index 1a4cc09b3d06..cf790b74d97b 100644
--- a/doc/source/serve/monitoring.md
+++ b/doc/source/serve/monitoring.md
@@ -755,6 +755,11 @@ These lifecycle **histograms** use `deployment` and `application` labels only—
 
 These metrics provide visibility into autoscaling behavior and help debug scaling issues.
 
+By default, controller-emitted metrics include source identifiers such as `handle`
+and `replica` where applicable. For large deployments, set
+`RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` to drop those
+source-level high-cardinality tags while retaining `deployment` and `application`.
+
 | Metric | Type | Tags | Description |
 |--------|------|------|-------------|
 | `ray_serve_autoscaling_target_replicas` | Gauge | `deployment`, `application` | Target number of replicas the autoscaler is trying to reach. Compare with actual replicas to identify scaling lag. |
diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 1d3a2d36e2f8..974a2f8c4c6d 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -999,6 +999,14 @@
 RAY_SERVE_AGGREGATE_METRICS_AT_CONTROLLER = get_env_bool(
     "RAY_SERVE_AGGREGATE_METRICS_AT_CONTROLLER", "0"
 )
+
+# Feature flag to include high-cardinality source tags on Serve controller metrics.
+# Disable this to keep deployment/application tags while dropping source identifiers
+# like replica IDs and handle IDs from controller-emitted metrics.
+RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS = get_env_bool(
+    "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS", "1"
+)
+
 # Feature flag to use compact (low-cardinality) namespace tags on long poll metrics.
 # When enabled, metric tags use only the LongPollNamespace enum name
 # (e.g., "DEPLOYMENT_CONFIG") instead of the full key string which includes
diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index 5e473f0f2375..1a1fe972d851 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -36,6 +36,7 @@
 from ray.serve._private.constants import (
     CONTROL_LOOP_INTERVAL_S,
     RAY_SERVE_CONTROLLER_CALLBACK_IMPORT_PATH,
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
     RAY_SERVE_ENABLE_HA_PROXY,
     RAY_SERVE_LOG_TO_STDERR,
@@ -114,6 +115,32 @@
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
+_AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS = ("deployment", "application")
+
+
+def _get_autoscaling_metrics_delay_tag_keys(
+    high_cardinality_tag_keys: Tuple[str, ...],
+) -> Tuple[str, ...]:
+    """Return tag keys for controller autoscaling metrics delay gauges."""
+    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+        return _AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS + high_cardinality_tag_keys
+    return _AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS
+
+
+def _get_autoscaling_metrics_delay_tags(
+    deployment_id: DeploymentID,
+    high_cardinality_tags: Dict[str, str],
+) -> Dict[str, str]:
+    """Return tags for controller autoscaling metrics delay gauge updates."""
+    tags = {
+        "deployment": deployment_id.name,
+        "application": deployment_id.app_name,
+    }
+    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+        tags.update(high_cardinality_tags)
+    return tags
+
+
 # Used for testing purposes only. If this is set, the controller will crash
 # after writing each checkpoint with the specified probability.
 _CRASH_AFTER_CHECKPOINT_PROBABILITY = 0
@@ -366,11 +393,10 @@ def record_autoscaling_metrics_from_replica(
         # Record the metrics delay for observability
         self.replica_metrics_delay_gauge.set(
             latency_ms,
-            tags={
-                "deployment": replica_metric_report.replica_id.deployment_id.name,
-                "application": replica_metric_report.replica_id.deployment_id.app_name,
-                "replica": replica_metric_report.replica_id.unique_id,
-            },
+            tags=_get_autoscaling_metrics_delay_tags(
+                replica_metric_report.replica_id.deployment_id,
+                {"replica": replica_metric_report.replica_id.unique_id},
+            ),
         )
         # Track in health metrics
         self._health_metrics_tracker.record_replica_metrics_delay(latency_ms)
@@ -388,11 +414,10 @@ def record_autoscaling_metrics_from_handle(
         # Record the metrics delay for observability
         self.handle_metrics_delay_gauge.set(
             latency_ms,
-            tags={
-                "deployment": handle_metric_report.deployment_id.name,
-                "application": handle_metric_report.deployment_id.app_name,
-                "handle": handle_metric_report.handle_id,
-            },
+            tags=_get_autoscaling_metrics_delay_tags(
+                handle_metric_report.deployment_id,
+                {"handle": handle_metric_report.handle_id},
+            ),
         )
         # Track in health metrics
         self._health_metrics_tracker.record_handle_metrics_delay(latency_ms)
@@ -759,7 +784,7 @@ def _create_control_loop_metrics(self):
                 "Time taken for the replica metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=("deployment", "application", "replica"),
+            tag_keys=_get_autoscaling_metrics_delay_tag_keys(("replica",)),
         )
         self.handle_metrics_delay_gauge = metrics.Gauge(
             "serve_autoscaling_handle_metrics_delay_ms",
@@ -767,7 +792,7 @@ def _create_control_loop_metrics(self):
                 "Time taken for the handle metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=("deployment", "application", "handle"),
+            tag_keys=_get_autoscaling_metrics_delay_tag_keys(("handle",)),
         )
         self.async_inference_task_queue_metrics_delay_gauge = metrics.Gauge(
             "serve_autoscaling_async_inference_task_queue_metrics_delay_ms",
@@ -775,7 +800,7 @@ def _create_control_loop_metrics(self):
                 "Time taken for the async inference task queue metrics to be reported "
                 "to the controller. High values may indicate a busy controller."
             ),
-            tag_keys=("deployment", "application"),
+            tag_keys=_AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS,
         )
 
     def _recover_state_from_checkpoint(self):
diff --git a/python/ray/serve/tests/unit/test_controller.py b/python/ray/serve/tests/unit/test_controller.py
index faf6b020e2c2..0841880fec0e 100644
--- a/python/ray/serve/tests/unit/test_controller.py
+++ b/python/ray/serve/tests/unit/test_controller.py
@@ -2,8 +2,10 @@
 
 import pytest
 
-from ray.serve._private.common import TargetCapacityDirection
+from ray.serve._private.common import DeploymentID, TargetCapacityDirection
 from ray.serve._private.controller import (
+    _get_autoscaling_metrics_delay_tag_keys,
+    _get_autoscaling_metrics_delay_tags,
     applications_match,
     calculate_target_capacity_direction,
 )
@@ -20,6 +22,56 @@
 )
 
 
+def test_autoscaling_metrics_delay_tags_include_high_cardinality_by_default(
+    monkeypatch,
+):
+    from ray.serve._private import controller
+
+    monkeypatch.setattr(
+        controller,
+        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
+        True,
+    )
+
+    assert _get_autoscaling_metrics_delay_tag_keys(("handle",)) == (
+        "deployment",
+        "application",
+        "handle",
+    )
+    assert _get_autoscaling_metrics_delay_tags(
+        DeploymentID(name="deployment", app_name="application"),
+        {"handle": "handle-id"},
+    ) == {
+        "deployment": "deployment",
+        "application": "application",
+        "handle": "handle-id",
+    }
+
+
+def test_autoscaling_metrics_delay_tags_exclude_high_cardinality(
+    monkeypatch,
+):
+    from ray.serve._private import controller
+
+    monkeypatch.setattr(
+        controller,
+        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
+        False,
+    )
+
+    assert _get_autoscaling_metrics_delay_tag_keys(("replica",)) == (
+        "deployment",
+        "application",
+    )
+    assert _get_autoscaling_metrics_delay_tags(
+        DeploymentID(name="deployment", app_name="application"),
+        {"replica": "replica-id"},
+    ) == {
+        "deployment": "deployment",
+        "application": "application",
+    }
+
+
 def create_app_config(name: str) -> ServeApplicationSchema:
     return ServeApplicationSchema(
         name=name, import_path=f"fake.{name}", route_prefix=f"/{name}"

From 258508380f150260f9814fd28079ec2723082969 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 26 May 2026 16:19:46 +0000
Subject: [PATCH 02/23] Addressed cursor comments

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/controller.py        | 11 +++++++----
 python/ray/serve/tests/unit/test_controller.py |  6 ++++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index 1a1fe972d851..85c5f311f8f3 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -129,7 +129,8 @@ def _get_autoscaling_metrics_delay_tag_keys(
 
 def _get_autoscaling_metrics_delay_tags(
     deployment_id: DeploymentID,
-    high_cardinality_tags: Dict[str, str],
+    high_cardinality_tag_key: str,
+    high_cardinality_tag_value: str,
 ) -> Dict[str, str]:
     """Return tags for controller autoscaling metrics delay gauge updates."""
     tags = {
@@ -137,7 +138,7 @@ def _get_autoscaling_metrics_delay_tags(
         "application": deployment_id.app_name,
     }
     if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        tags.update(high_cardinality_tags)
+        tags[high_cardinality_tag_key] = high_cardinality_tag_value
     return tags
 
 
@@ -395,7 +396,8 @@ def record_autoscaling_metrics_from_replica(
             latency_ms,
             tags=_get_autoscaling_metrics_delay_tags(
                 replica_metric_report.replica_id.deployment_id,
-                {"replica": replica_metric_report.replica_id.unique_id},
+                "replica",
+                replica_metric_report.replica_id.unique_id,
             ),
         )
         # Track in health metrics
@@ -416,7 +418,8 @@ def record_autoscaling_metrics_from_handle(
             latency_ms,
             tags=_get_autoscaling_metrics_delay_tags(
                 handle_metric_report.deployment_id,
-                {"handle": handle_metric_report.handle_id},
+                "handle",
+                handle_metric_report.handle_id,
             ),
         )
         # Track in health metrics
diff --git a/python/ray/serve/tests/unit/test_controller.py b/python/ray/serve/tests/unit/test_controller.py
index 0841880fec0e..56bcd6d73268 100644
--- a/python/ray/serve/tests/unit/test_controller.py
+++ b/python/ray/serve/tests/unit/test_controller.py
@@ -40,7 +40,8 @@ def test_autoscaling_metrics_delay_tags_include_high_cardinality_by_default(
     )
     assert _get_autoscaling_metrics_delay_tags(
         DeploymentID(name="deployment", app_name="application"),
-        {"handle": "handle-id"},
+        "handle",
+        "handle-id",
     ) == {
         "deployment": "deployment",
         "application": "application",
@@ -65,7 +66,8 @@ def test_autoscaling_metrics_delay_tags_exclude_high_cardinality(
     )
     assert _get_autoscaling_metrics_delay_tags(
         DeploymentID(name="deployment", app_name="application"),
-        {"replica": "replica-id"},
+        "replica",
+        "replica-id",
     ) == {
         "deployment": "deployment",
         "application": "application",

From 2b7012a5fe74b36a11ac7e85fb54fd22432bfd56 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 2 Jun 2026 14:44:32 +0000
Subject: [PATCH 03/23] Also classify lifecycle and autoscaling replica metrics
 as high-card

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 doc/source/serve/monitoring.md                |  9 ++-
 python/ray/serve/_private/deployment_state.py | 31 ++++++-
 python/ray/serve/_private/replica.py          | 53 +++++++++++-
 .../serve/tests/unit/test_deployment_state.py | 81 +++++++++++++++++++
 4 files changed, 164 insertions(+), 10 deletions(-)

diff --git a/doc/source/serve/monitoring.md b/doc/source/serve/monitoring.md
index cf790b74d97b..210cf015b340 100644
--- a/doc/source/serve/monitoring.md
+++ b/doc/source/serve/monitoring.md
@@ -740,6 +740,11 @@ These metrics track replica health, restarts, and lifecycle timing.
 These lifecycle **histograms** use `deployment` and `application` labels only—no `replica` label—so Prometheus cardinality stays manageable at scale.
 :::
 
+By default, replica lifecycle metrics include source identifiers such as
+`replica` where applicable. For large deployments, set
+`RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` to drop those
+source-level high-cardinality tags while retaining `deployment` and `application`.
+
 | Metric | Type | Tags | Description |
 |--------|------|------|-------------|
 | `ray_serve_deployment_replica_healthy` | Gauge | `deployment`, `replica`, `application` | Health status of the replica: `1` = healthy, `0` = unhealthy. |
@@ -755,8 +760,8 @@ These lifecycle **histograms** use `deployment` and `application` labels only—
 
 These metrics provide visibility into autoscaling behavior and help debug scaling issues.
 
-By default, controller-emitted metrics include source identifiers such as `handle`
-and `replica` where applicable. For large deployments, set
+By default, autoscaling metrics include source identifiers such as `handle` and
+`replica` where applicable. For large deployments, set
 `RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` to drop those
 source-level high-cardinality tags while retaining `deployment` and `application`.
 
diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index 583218514354..38da6574f21e 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -49,6 +49,7 @@
     DEPLOYMENT_ACTOR_HEALTH_CHECK_TIMEOUT_S,
     DEPLOYMENT_ACTOR_HEALTH_CHECK_UNHEALTHY_THRESHOLD,
     MAX_PER_REPLICA_RETRY_COUNT,
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
     RAY_SERVE_DIRECT_INGRESS_MIN_DRAINING_PERIOD_S,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
     RAY_SERVE_ENABLE_TASK_EVENTS,
@@ -105,6 +106,23 @@
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
+_REPLICA_LIFECYCLE_METRIC_BASE_TAG_KEYS = ("deployment", "application")
+
+
+def _get_replica_lifecycle_metric_tag_keys() -> Tuple[str, ...]:
+    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+        return ("deployment", "replica", "application")
+    return _REPLICA_LIFECYCLE_METRIC_BASE_TAG_KEYS
+
+
+def _get_replica_lifecycle_metric_tags(
+    replica_unique_id: str,
+) -> Optional[Dict[str, str]]:
+    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+        return {"replica": replica_unique_id}
+    return None
+
+
 _RESERVED_INTERNAL_DEPLOYMENT_CONTEXT_ENV_VARS = {
     RAY_SERVE_INTERNAL_DEPLOYMENT_APP_NAME_ENV_VAR,
     RAY_SERVE_INTERNAL_DEPLOYMENT_NAME_ENV_VAR,
@@ -2882,7 +2900,7 @@ def __init__(
                 "Tracks whether this deployment replica is healthy. 1 means "
                 "healthy, 0 means unhealthy."
             ),
-            tag_keys=("deployment", "replica", "application"),
+            tag_keys=_get_replica_lifecycle_metric_tag_keys(),
         )
         self.health_check_gauge.set_default_tags(
             {"deployment": self._id.name, "application": self._id.app_name}
@@ -2937,7 +2955,7 @@ def __init__(
         self.health_check_failures_counter = metrics.Counter(
             "serve_health_check_failures_total",
             description=("Count of failed health checks."),
-            tag_keys=("deployment", "replica", "application"),
+            tag_keys=_get_replica_lifecycle_metric_tag_keys(),
         )
         self.health_check_failures_counter.set_default_tags(
             {"deployment": self._id.name, "application": self._id.app_name}
@@ -4358,7 +4376,10 @@ def _set_health_gauge(self, replica_unique_id: str, value: int) -> None:
             and (now - cached[1]) < RAY_SERVE_STATUS_GAUGE_REPORT_INTERVAL_S
         ):
             return
-        self.health_check_gauge.set(value, tags={"replica": replica_unique_id})
+        self.health_check_gauge.set(
+            value,
+            tags=_get_replica_lifecycle_metric_tags(replica_unique_id),
+        )
         self._health_gauge_cache[replica_unique_id] = (value, now)
 
     def _register_gang_replica(self, replica_id: ReplicaID, gang_id: str) -> None:
@@ -4525,7 +4546,9 @@ def check_and_update_replicas(self):
                 )
             if replica.last_health_check_failed:
                 self.health_check_failures_counter.inc(
-                    tags={"replica": replica.replica_id.unique_id}
+                    tags=_get_replica_lifecycle_metric_tags(
+                        replica.replica_id.unique_id
+                    )
                 )
 
             if is_healthy:
diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py
index d1191d41b3f5..f2cd39ef5f5a 100644
--- a/python/ray/serve/_private/replica.py
+++ b/python/ray/serve/_private/replica.py
@@ -63,6 +63,7 @@
     HEALTHY_MESSAGE,
     RAY_SERVE_AUTOSCALING_METRIC_RECORD_INTERVAL_FACTOR,
     RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE,
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
     RAY_SERVE_DIRECT_INGRESS_MIN_DRAINING_PERIOD_S,
     RAY_SERVE_DIRECT_INGRESS_PORT_RETRY_COUNT,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
@@ -190,6 +191,30 @@
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
+_REPLICA_METRIC_BASE_TAG_KEYS = ("deployment", "application")
+
+
+def _get_replica_metric_default_tag_keys(
+    additional_tag_keys: Tuple[str, ...] = tuple(),
+) -> Tuple[str, ...]:
+    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+        return ("deployment", "replica", "application") + additional_tag_keys
+    return _REPLICA_METRIC_BASE_TAG_KEYS + additional_tag_keys
+
+
+def _get_replica_metric_default_tags(
+    deployment_id: DeploymentID,
+    replica_unique_id: str,
+) -> Dict[str, str]:
+    tags = {
+        "deployment": deployment_id.name,
+        "application": deployment_id.app_name,
+    }
+    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+        tags["replica"] = replica_unique_id
+    return tags
+
+
 SERVE_BUILD_ASGI_APP_METHOD = "__serve_build_asgi_app__"
 
 
@@ -363,11 +388,18 @@ def __init__(
         self._cached_metrics_interval_s = RAY_SERVE_METRICS_EXPORT_INTERVAL_MS / 1000
 
         # Request counter (only set on replica startup).
-        self._restart_counter = metrics.Counter(
+        self._restart_counter = ray_metrics.Counter(
             "serve_deployment_replica_starts",
             description=(
                 "The number of times this replica has been restarted due to failure."
             ),
+            tag_keys=_get_replica_metric_default_tag_keys(),
+        )
+        self._restart_counter.set_default_tags(
+            _get_replica_metric_default_tags(
+                self._deployment_id,
+                self._replica_id.unique_id,
+            )
         )
         self._restart_counter.inc()
 
@@ -426,19 +458,32 @@ def __init__(
             description="The current number of queries being processed.",
         )
 
-        self.record_autoscaling_stats_failed_counter = metrics.Counter(
+        self.record_autoscaling_stats_failed_counter = ray_metrics.Counter(
             "serve_record_autoscaling_stats_failed",
-            tag_keys=("exception_name",),
+            tag_keys=_get_replica_metric_default_tag_keys(("exception_name",)),
             description="The number of errored record_autoscaling_stats invocations.",
         )
+        self.record_autoscaling_stats_failed_counter.set_default_tags(
+            _get_replica_metric_default_tags(
+                self._deployment_id,
+                self._replica_id.unique_id,
+            )
+        )
 
-        self.user_autoscaling_stats_latency_tracker = metrics.Histogram(
+        self.user_autoscaling_stats_latency_tracker = ray_metrics.Histogram(
             "serve_user_autoscaling_stats_latency_ms",
             description=(
                 "Time taken to execute the user-defined autoscaling stats function "
                 "in milliseconds."
             ),
             boundaries=REQUEST_LATENCY_BUCKETS_MS,
+            tag_keys=_get_replica_metric_default_tag_keys(),
+        )
+        self.user_autoscaling_stats_latency_tracker.set_default_tags(
+            _get_replica_metric_default_tags(
+                self._deployment_id,
+                self._replica_id.unique_id,
+            )
         )
 
         # Replica utilization tracking with rolling window.
diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py
index 1269d1083e65..0bc5bc2a9d7c 100644
--- a/python/ray/serve/tests/unit/test_deployment_state.py
+++ b/python/ray/serve/tests/unit/test_deployment_state.py
@@ -36,6 +36,8 @@
     RAY_SERVE_INTERNAL_DEPLOYMENT_NAME_ENV_VAR,
     RAY_SERVE_STATUS_GAUGE_REPORT_INTERVAL_S,
 )
+from ray.serve._private import deployment_state as deployment_state_module
+from ray.serve._private import replica as replica_module
 from ray.serve._private.deployment_info import DeploymentInfo
 from ray.serve._private.deployment_state import (
     ALL_REPLICA_STATES,
@@ -51,6 +53,8 @@
     DeploymentVersion,
     ReplicaStartupStatus,
     ReplicaStateContainer,
+    _get_replica_lifecycle_metric_tag_keys,
+    _get_replica_lifecycle_metric_tags,
 )
 from ray.serve._private.exceptions import DeploymentIsBeingDeletedError
 from ray.serve._private.long_poll import LongPollNamespace
@@ -65,6 +69,10 @@
     get_capacity_adjusted_num_replicas,
     get_random_string,
 )
+from ray.serve._private.replica import (
+    _get_replica_metric_default_tag_keys,
+    _get_replica_metric_default_tags,
+)
 from ray.serve.config import DeploymentActorConfig, GangSchedulingConfig
 from ray.serve.schema import ReplicaRank
 from ray.util.placement_group import validate_placement_group
@@ -73,6 +81,79 @@
 TEST_DEPLOYMENT_ID_2 = DeploymentID(name="test_deployment_2", app_name="test_app")
 
 
+def test_replica_lifecycle_metric_tags_include_high_cardinality_by_default(
+    monkeypatch,
+):
+    monkeypatch.setattr(
+        deployment_state_module,
+        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
+        True,
+    )
+
+    assert _get_replica_lifecycle_metric_tag_keys() == (
+        "deployment",
+        "replica",
+        "application",
+    )
+    assert _get_replica_lifecycle_metric_tags("replica-id") == {
+        "replica": "replica-id"
+    }
+
+
+def test_replica_lifecycle_metric_tags_exclude_high_cardinality(monkeypatch):
+    monkeypatch.setattr(
+        deployment_state_module,
+        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
+        False,
+    )
+
+    assert _get_replica_lifecycle_metric_tag_keys() == (
+        "deployment",
+        "application",
+    )
+    assert _get_replica_lifecycle_metric_tags("replica-id") is None
+
+
+def test_replica_metric_default_tags_include_high_cardinality_by_default(
+    monkeypatch,
+):
+    monkeypatch.setattr(
+        replica_module,
+        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
+        True,
+    )
+
+    assert _get_replica_metric_default_tag_keys(("exception_name",)) == (
+        "deployment",
+        "replica",
+        "application",
+        "exception_name",
+    )
+    assert _get_replica_metric_default_tags(TEST_DEPLOYMENT_ID, "replica-id") == {
+        "deployment": "test_deployment",
+        "application": "test_app",
+        "replica": "replica-id",
+    }
+
+
+def test_replica_metric_default_tags_exclude_high_cardinality(monkeypatch):
+    monkeypatch.setattr(
+        replica_module,
+        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
+        False,
+    )
+
+    assert _get_replica_metric_default_tag_keys(("exception_name",)) == (
+        "deployment",
+        "application",
+        "exception_name",
+    )
+    assert _get_replica_metric_default_tags(TEST_DEPLOYMENT_ID, "replica-id") == {
+        "deployment": "test_deployment",
+        "application": "test_app",
+    }
+
+
 def deployment_info(
     version: Optional[str] = None,
     num_replicas: Optional[int] = 1,

From 07a863581f179a44f1f0a348faf3546658980213 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 2 Jun 2026 14:47:26 +0000
Subject: [PATCH 04/23] Lint updates

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 .../ray/serve/tests/unit/test_deployment_state.py  | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py
index 0bc5bc2a9d7c..bdf651693bb7 100644
--- a/python/ray/serve/tests/unit/test_deployment_state.py
+++ b/python/ray/serve/tests/unit/test_deployment_state.py
@@ -7,6 +7,10 @@
 
 from ray._common.ray_constants import DEFAULT_MAX_CONCURRENCY_ASYNC
 from ray._raylet import NodeID
+from ray.serve._private import (
+    deployment_state as deployment_state_module,
+    replica as replica_module,
+)
 from ray.serve._private.autoscaling_state import AutoscalingStateManager
 from ray.serve._private.common import (
     RUNNING_REQUESTS_KEY,
@@ -36,8 +40,6 @@
     RAY_SERVE_INTERNAL_DEPLOYMENT_NAME_ENV_VAR,
     RAY_SERVE_STATUS_GAUGE_REPORT_INTERVAL_S,
 )
-from ray.serve._private import deployment_state as deployment_state_module
-from ray.serve._private import replica as replica_module
 from ray.serve._private.deployment_info import DeploymentInfo
 from ray.serve._private.deployment_state import (
     ALL_REPLICA_STATES,
@@ -58,6 +60,10 @@
 )
 from ray.serve._private.exceptions import DeploymentIsBeingDeletedError
 from ray.serve._private.long_poll import LongPollNamespace
+from ray.serve._private.replica import (
+    _get_replica_metric_default_tag_keys,
+    _get_replica_metric_default_tags,
+)
 from ray.serve._private.test_utils import (
     MockDeploymentActorWrapper,
     MockPlacementGroup,
@@ -69,10 +75,6 @@
     get_capacity_adjusted_num_replicas,
     get_random_string,
 )
-from ray.serve._private.replica import (
-    _get_replica_metric_default_tag_keys,
-    _get_replica_metric_default_tags,
-)
 from ray.serve.config import DeploymentActorConfig, GangSchedulingConfig
 from ray.serve.schema import ReplicaRank
 from ray.util.placement_group import validate_placement_group

From 4cc014480b366b6d6bf7de968255bd08359a04fc Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 2 Jun 2026 14:48:26 +0000
Subject: [PATCH 05/23] Lint updates

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/tests/unit/test_deployment_state.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py
index bdf651693bb7..56e1d2cbd029 100644
--- a/python/ray/serve/tests/unit/test_deployment_state.py
+++ b/python/ray/serve/tests/unit/test_deployment_state.py
@@ -97,9 +97,7 @@ def test_replica_lifecycle_metric_tags_include_high_cardinality_by_default(
         "replica",
         "application",
     )
-    assert _get_replica_lifecycle_metric_tags("replica-id") == {
-        "replica": "replica-id"
-    }
+    assert _get_replica_lifecycle_metric_tags("replica-id") == {"replica": "replica-id"}
 
 
 def test_replica_lifecycle_metric_tags_exclude_high_cardinality(monkeypatch):

From ccd39e788b331358aece2ae4092447dbf451df52 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Wed, 3 Jun 2026 17:40:56 +0000
Subject: [PATCH 06/23] Addressed review comments, simplified/removed code

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 doc/source/serve/monitoring.md                |  13 +-
 python/ray/serve/_private/controller.py       |   8 +-
 python/ray/serve/_private/deployment_state.py |  44 +++---
 python/ray/serve/_private/replica.py          |  53 +------
 python/ray/serve/tests/conftest.py            |  41 ++++--
 python/ray/serve/tests/test_metrics.py        | 138 ++++++++++++++++++
 .../ray/serve/tests/unit/test_controller.py   |  56 +------
 .../serve/tests/unit/test_deployment_state.py |  81 ----------
 8 files changed, 201 insertions(+), 233 deletions(-)

diff --git a/doc/source/serve/monitoring.md b/doc/source/serve/monitoring.md
index 210cf015b340..defa0a2c7dbd 100644
--- a/doc/source/serve/monitoring.md
+++ b/doc/source/serve/monitoring.md
@@ -740,10 +740,12 @@ These metrics track replica health, restarts, and lifecycle timing.
 These lifecycle **histograms** use `deployment` and `application` labels only—no `replica` label—so Prometheus cardinality stays manageable at scale.
 :::
 
-By default, replica lifecycle metrics include source identifiers such as
-`replica` where applicable. For large deployments, set
+By default, controller-emitted replica lifecycle metrics include source
+identifiers such as `replica` where applicable. For large deployments, set
 `RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` to drop those
 source-level high-cardinality tags while retaining `deployment` and `application`.
+This setting doesn't affect replica-emitted metrics such as
+`ray_serve_deployment_replica_starts_total`.
 
 | Metric | Type | Tags | Description |
 |--------|------|------|-------------|
@@ -760,10 +762,13 @@ source-level high-cardinality tags while retaining `deployment` and `application
 
 These metrics provide visibility into autoscaling behavior and help debug scaling issues.
 
-By default, autoscaling metrics include source identifiers such as `handle` and
-`replica` where applicable. For large deployments, set
+By default, controller-emitted autoscaling delay metrics include source
+identifiers such as `handle` and `replica` where applicable. For large deployments, set
 `RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` to drop those
 source-level high-cardinality tags while retaining `deployment` and `application`.
+This setting doesn't affect replica-emitted metrics such as
+`ray_serve_record_autoscaling_stats_failed_total` or
+`ray_serve_user_autoscaling_stats_latency_ms`.
 
 | Metric | Type | Tags | Description |
 |--------|------|------|-------------|
diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index 85c5f311f8f3..4e1f1e37598b 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -119,11 +119,11 @@
 
 
 def _get_autoscaling_metrics_delay_tag_keys(
-    high_cardinality_tag_keys: Tuple[str, ...],
+    high_cardinality_tag_key: str,
 ) -> Tuple[str, ...]:
     """Return tag keys for controller autoscaling metrics delay gauges."""
     if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        return _AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS + high_cardinality_tag_keys
+        return _AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS + (high_cardinality_tag_key,)
     return _AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS
 
 
@@ -787,7 +787,7 @@ def _create_control_loop_metrics(self):
                 "Time taken for the replica metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=_get_autoscaling_metrics_delay_tag_keys(("replica",)),
+            tag_keys=_get_autoscaling_metrics_delay_tag_keys("replica"),
         )
         self.handle_metrics_delay_gauge = metrics.Gauge(
             "serve_autoscaling_handle_metrics_delay_ms",
@@ -795,7 +795,7 @@ def _create_control_loop_metrics(self):
                 "Time taken for the handle metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=_get_autoscaling_metrics_delay_tag_keys(("handle",)),
+            tag_keys=_get_autoscaling_metrics_delay_tag_keys("handle"),
         )
         self.async_inference_task_queue_metrics_delay_gauge = metrics.Gauge(
             "serve_autoscaling_async_inference_task_queue_metrics_delay_ms",
diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index 38da6574f21e..144dc1c99921 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -106,23 +106,6 @@
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
-_REPLICA_LIFECYCLE_METRIC_BASE_TAG_KEYS = ("deployment", "application")
-
-
-def _get_replica_lifecycle_metric_tag_keys() -> Tuple[str, ...]:
-    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        return ("deployment", "replica", "application")
-    return _REPLICA_LIFECYCLE_METRIC_BASE_TAG_KEYS
-
-
-def _get_replica_lifecycle_metric_tags(
-    replica_unique_id: str,
-) -> Optional[Dict[str, str]]:
-    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        return {"replica": replica_unique_id}
-    return None
-
-
 _RESERVED_INTERNAL_DEPLOYMENT_CONTEXT_ENV_VARS = {
     RAY_SERVE_INTERNAL_DEPLOYMENT_APP_NAME_ENV_VAR,
     RAY_SERVE_INTERNAL_DEPLOYMENT_NAME_ENV_VAR,
@@ -2894,13 +2877,19 @@ def __init__(
         # Deployment-scoped actor lifecycle (per deployment)
         self._deployment_actors = DeploymentActorContainer(self._id)
 
+        replica_lifecycle_metric_tag_keys = (
+            ("deployment", "replica", "application")
+            if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS
+            else ("deployment", "application")
+        )
+
         self.health_check_gauge = metrics.Gauge(
             "serve_deployment_replica_healthy",
             description=(
                 "Tracks whether this deployment replica is healthy. 1 means "
                 "healthy, 0 means unhealthy."
             ),
-            tag_keys=_get_replica_lifecycle_metric_tag_keys(),
+            tag_keys=replica_lifecycle_metric_tag_keys,
         )
         self.health_check_gauge.set_default_tags(
             {"deployment": self._id.name, "application": self._id.app_name}
@@ -2955,7 +2944,7 @@ def __init__(
         self.health_check_failures_counter = metrics.Counter(
             "serve_health_check_failures_total",
             description=("Count of failed health checks."),
-            tag_keys=_get_replica_lifecycle_metric_tag_keys(),
+            tag_keys=replica_lifecycle_metric_tag_keys,
         )
         self.health_check_failures_counter.set_default_tags(
             {"deployment": self._id.name, "application": self._id.app_name}
@@ -4376,10 +4365,10 @@ def _set_health_gauge(self, replica_unique_id: str, value: int) -> None:
             and (now - cached[1]) < RAY_SERVE_STATUS_GAUGE_REPORT_INTERVAL_S
         ):
             return
-        self.health_check_gauge.set(
-            value,
-            tags=_get_replica_lifecycle_metric_tags(replica_unique_id),
-        )
+        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            self.health_check_gauge.set(value, tags={"replica": replica_unique_id})
+        else:
+            self.health_check_gauge.set(value)
         self._health_gauge_cache[replica_unique_id] = (value, now)
 
     def _register_gang_replica(self, replica_id: ReplicaID, gang_id: str) -> None:
@@ -4545,11 +4534,12 @@ def check_and_update_replicas(self):
                     replica.last_health_check_latency_ms
                 )
             if replica.last_health_check_failed:
-                self.health_check_failures_counter.inc(
-                    tags=_get_replica_lifecycle_metric_tags(
-                        replica.replica_id.unique_id
+                if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+                    self.health_check_failures_counter.inc(
+                        tags={"replica": replica.replica_id.unique_id}
                     )
-                )
+                else:
+                    self.health_check_failures_counter.inc()
 
             if is_healthy:
                 healthy_replicas.append(replica)
diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py
index f2cd39ef5f5a..d1191d41b3f5 100644
--- a/python/ray/serve/_private/replica.py
+++ b/python/ray/serve/_private/replica.py
@@ -63,7 +63,6 @@
     HEALTHY_MESSAGE,
     RAY_SERVE_AUTOSCALING_METRIC_RECORD_INTERVAL_FACTOR,
     RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE,
-    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
     RAY_SERVE_DIRECT_INGRESS_MIN_DRAINING_PERIOD_S,
     RAY_SERVE_DIRECT_INGRESS_PORT_RETRY_COUNT,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
@@ -191,30 +190,6 @@
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
-_REPLICA_METRIC_BASE_TAG_KEYS = ("deployment", "application")
-
-
-def _get_replica_metric_default_tag_keys(
-    additional_tag_keys: Tuple[str, ...] = tuple(),
-) -> Tuple[str, ...]:
-    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        return ("deployment", "replica", "application") + additional_tag_keys
-    return _REPLICA_METRIC_BASE_TAG_KEYS + additional_tag_keys
-
-
-def _get_replica_metric_default_tags(
-    deployment_id: DeploymentID,
-    replica_unique_id: str,
-) -> Dict[str, str]:
-    tags = {
-        "deployment": deployment_id.name,
-        "application": deployment_id.app_name,
-    }
-    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        tags["replica"] = replica_unique_id
-    return tags
-
-
 SERVE_BUILD_ASGI_APP_METHOD = "__serve_build_asgi_app__"
 
 
@@ -388,18 +363,11 @@ def __init__(
         self._cached_metrics_interval_s = RAY_SERVE_METRICS_EXPORT_INTERVAL_MS / 1000
 
         # Request counter (only set on replica startup).
-        self._restart_counter = ray_metrics.Counter(
+        self._restart_counter = metrics.Counter(
             "serve_deployment_replica_starts",
             description=(
                 "The number of times this replica has been restarted due to failure."
             ),
-            tag_keys=_get_replica_metric_default_tag_keys(),
-        )
-        self._restart_counter.set_default_tags(
-            _get_replica_metric_default_tags(
-                self._deployment_id,
-                self._replica_id.unique_id,
-            )
         )
         self._restart_counter.inc()
 
@@ -458,32 +426,19 @@ def __init__(
             description="The current number of queries being processed.",
         )
 
-        self.record_autoscaling_stats_failed_counter = ray_metrics.Counter(
+        self.record_autoscaling_stats_failed_counter = metrics.Counter(
             "serve_record_autoscaling_stats_failed",
-            tag_keys=_get_replica_metric_default_tag_keys(("exception_name",)),
+            tag_keys=("exception_name",),
             description="The number of errored record_autoscaling_stats invocations.",
         )
-        self.record_autoscaling_stats_failed_counter.set_default_tags(
-            _get_replica_metric_default_tags(
-                self._deployment_id,
-                self._replica_id.unique_id,
-            )
-        )
 
-        self.user_autoscaling_stats_latency_tracker = ray_metrics.Histogram(
+        self.user_autoscaling_stats_latency_tracker = metrics.Histogram(
             "serve_user_autoscaling_stats_latency_ms",
             description=(
                 "Time taken to execute the user-defined autoscaling stats function "
                 "in milliseconds."
             ),
             boundaries=REQUEST_LATENCY_BUCKETS_MS,
-            tag_keys=_get_replica_metric_default_tag_keys(),
-        )
-        self.user_autoscaling_stats_latency_tracker.set_default_tags(
-            _get_replica_metric_default_tags(
-                self._deployment_id,
-                self._replica_id.unique_id,
-            )
         )
 
         # Replica utilization tracking with rolling window.
diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py
index 04647c73238d..30a469e14c48 100644
--- a/python/ray/serve/tests/conftest.py
+++ b/python/ray/serve/tests/conftest.py
@@ -361,19 +361,28 @@ def ready():
 
 @pytest.fixture
 def metrics_start_shutdown(request):
-    param = request.param if hasattr(request, "param") else None
-    request_timeout_s = param if param else None
     """Fixture provides a fresh Ray cluster to prevent metrics state sharing."""
-    wait_for_metrics_port_free()
-    ray.init(
-        _metrics_export_port=TEST_METRICS_EXPORT_PORT,
-        _system_config={
-            "metrics_report_interval_ms": 100,
-            "task_retry_delay_ms": 50,
-        },
-    )
+    param = request.param if hasattr(request, "param") else None
+    if isinstance(param, dict):
+        request_timeout_s = param.get("request_timeout_s")
+        env_vars = param.get("env_vars", {})
+    else:
+        request_timeout_s = param if param else None
+        env_vars = {}
+    old_env_vars = {key: os.environ.get(key) for key in env_vars}
+    for key, value in env_vars.items():
+        os.environ[key] = str(value)
 
     try:
+        wait_for_metrics_port_free()
+        ray.init(
+            _metrics_export_port=TEST_METRICS_EXPORT_PORT,
+            _system_config={
+                "metrics_report_interval_ms": 100,
+                "task_retry_delay_ms": 50,
+            },
+        )
+
         session_name = ray._private.worker._global_node.session_name
         wait_for_metrics_endpoint(session_name)
 
@@ -394,9 +403,15 @@ def metrics_start_shutdown(request):
             ),
         )
     finally:
-        serve.shutdown()
-        ray.shutdown()
-        reset_ray_address()
+        if ray.is_initialized():
+            serve.shutdown()
+            ray.shutdown()
+            reset_ray_address()
+        for key, old_value in old_env_vars.items():
+            if old_value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = old_value
 
 
 # Helper function to return the node ID of a remote worker.
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index 9ce46b67712b..d3f76fd43acc 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -25,8 +25,18 @@
     fetch_prometheus_metric_timeseries,
     wait_for_condition,
 )
+from ray.serve._private.common import (
+    DeploymentHandleSource,
+    DeploymentID,
+    HandleMetricReport,
+    ReplicaID,
+    ReplicaMetricReport,
+    TimeStampedValue,
+)
 from ray.serve._private.constants import (
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
+    SERVE_CONTROLLER_NAME,
+    SERVE_NAMESPACE,
 )
 from ray.serve._private.test_utils import (
     PROMETHEUS_METRICS_TIMEOUT_S,
@@ -42,6 +52,10 @@
 from ray.serve.config import RequestRouterConfig
 from ray.serve.generated import serve_pb2, serve_pb2_grpc
 
+CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR = (
+    "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS"
+)
+
 
 def extract_tags(line: str) -> Dict[str, str]:
     """Extracts any tags from the metrics line."""
@@ -1268,6 +1282,130 @@ def metrics_available():
     ), f"Latency metrics should use route patterns. Found: {latency_routes}"
 
 
+@pytest.mark.parametrize(
+    "metrics_start_shutdown, include_high_cardinality",
+    [
+        (
+            {"env_vars": {CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR: "1"}},
+            True,
+        ),
+        (
+            {"env_vars": {CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR: "0"}},
+            False,
+        ),
+    ],
+    ids=["include_high_cardinality", "exclude_high_cardinality"],
+    indirect=["metrics_start_shutdown"],
+)
+def test_controller_high_cardinality_metric_tags(
+    metrics_start_shutdown, include_high_cardinality
+):
+    """Test controller metrics respect high-cardinality tag config."""
+
+    @serve.deployment(
+        name="autoscaling_metrics_model",
+        health_check_period_s=0.1,
+        health_check_timeout_s=1,
+        autoscaling_config={
+            "min_replicas": 1,
+            "max_replicas": 2,
+            "target_ongoing_requests": 1,
+            "metrics_interval_s": 0.1,
+            "look_back_period_s": 1,
+        },
+    )
+    class Model:
+        def __init__(self):
+            self.should_fail_health_check = False
+
+        async def __call__(self, request: Request):
+            body = await request.body()
+            if body == b"fail_health_check":
+                self.should_fail_health_check = True
+            return "hello"
+
+        async def check_health(self):
+            if self.should_fail_health_check:
+                raise RuntimeError("Intentional health check failure.")
+
+    app_name = "autoscaling_metrics_app"
+    deployment_name = "autoscaling_metrics_model"
+    deployment_id = DeploymentID(deployment_name, app_name)
+    replica_id = ReplicaID("test-replica-id", deployment_id)
+    serve.run(Model.bind(), name=app_name)
+
+    url = get_application_url("HTTP", app_name)
+    assert httpx.get(url).text == "hello"
+    assert httpx.request("GET", url, content=b"fail_health_check").text == "hello"
+
+    controller = ray.get_actor(SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE)
+    now = time.time()
+    ray.get(
+        [
+            controller.record_autoscaling_metrics_from_handle.remote(
+                HandleMetricReport(
+                    deployment_id=deployment_id,
+                    handle_id="test-handle-id",
+                    actor_id="test-actor-id",
+                    handle_source=DeploymentHandleSource.UNKNOWN,
+                    aggregated_queued_requests=0,
+                    queued_requests=[TimeStampedValue(now, 0)],
+                    aggregated_metrics={},
+                    metrics={},
+                    timestamp=now,
+                )
+            ),
+            controller.record_autoscaling_metrics_from_replica.remote(
+                ReplicaMetricReport(
+                    replica_id=replica_id,
+                    aggregated_metrics={},
+                    metrics={},
+                    timestamp=now,
+                )
+            ),
+        ]
+    )
+
+    timeseries = PrometheusTimeseries()
+
+    def get_matching_metrics(metric_name: str):
+        return [
+            metric
+            for metric in get_metric_dictionaries(
+                metric_name, timeseries=timeseries, wait=False
+            )
+            if metric.get("deployment") == deployment_name
+            and metric.get("application") == app_name
+        ]
+
+    def assert_high_cardinality_tag(metric, tag):
+        assert (tag in metric) is include_high_cardinality
+
+    def check_controller_metric_tags():
+        health_failure_metrics = get_matching_metrics(
+            "ray_serve_health_check_failures_total"
+        )
+        handle_metrics = get_matching_metrics(
+            "ray_serve_autoscaling_handle_metrics_delay_ms"
+        )
+        replica_metrics = get_matching_metrics(
+            "ray_serve_autoscaling_replica_metrics_delay_ms"
+        )
+        if not health_failure_metrics or not handle_metrics or not replica_metrics:
+            return False
+
+        for metric in health_failure_metrics:
+            assert_high_cardinality_tag(metric, "replica")
+        for metric in handle_metrics:
+            assert_high_cardinality_tag(metric, "handle")
+        for metric in replica_metrics:
+            assert_high_cardinality_tag(metric, "replica")
+
+        return True
+
+    wait_for_condition(check_controller_metric_tags, timeout=60)
+
+
 def test_routing_stats_delay_metric(metrics_start_shutdown):
     """Test that routing stats delay metric is reported correctly."""
 
diff --git a/python/ray/serve/tests/unit/test_controller.py b/python/ray/serve/tests/unit/test_controller.py
index 56bcd6d73268..faf6b020e2c2 100644
--- a/python/ray/serve/tests/unit/test_controller.py
+++ b/python/ray/serve/tests/unit/test_controller.py
@@ -2,10 +2,8 @@
 
 import pytest
 
-from ray.serve._private.common import DeploymentID, TargetCapacityDirection
+from ray.serve._private.common import TargetCapacityDirection
 from ray.serve._private.controller import (
-    _get_autoscaling_metrics_delay_tag_keys,
-    _get_autoscaling_metrics_delay_tags,
     applications_match,
     calculate_target_capacity_direction,
 )
@@ -22,58 +20,6 @@
 )
 
 
-def test_autoscaling_metrics_delay_tags_include_high_cardinality_by_default(
-    monkeypatch,
-):
-    from ray.serve._private import controller
-
-    monkeypatch.setattr(
-        controller,
-        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
-        True,
-    )
-
-    assert _get_autoscaling_metrics_delay_tag_keys(("handle",)) == (
-        "deployment",
-        "application",
-        "handle",
-    )
-    assert _get_autoscaling_metrics_delay_tags(
-        DeploymentID(name="deployment", app_name="application"),
-        "handle",
-        "handle-id",
-    ) == {
-        "deployment": "deployment",
-        "application": "application",
-        "handle": "handle-id",
-    }
-
-
-def test_autoscaling_metrics_delay_tags_exclude_high_cardinality(
-    monkeypatch,
-):
-    from ray.serve._private import controller
-
-    monkeypatch.setattr(
-        controller,
-        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
-        False,
-    )
-
-    assert _get_autoscaling_metrics_delay_tag_keys(("replica",)) == (
-        "deployment",
-        "application",
-    )
-    assert _get_autoscaling_metrics_delay_tags(
-        DeploymentID(name="deployment", app_name="application"),
-        "replica",
-        "replica-id",
-    ) == {
-        "deployment": "deployment",
-        "application": "application",
-    }
-
-
 def create_app_config(name: str) -> ServeApplicationSchema:
     return ServeApplicationSchema(
         name=name, import_path=f"fake.{name}", route_prefix=f"/{name}"
diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py
index 56e1d2cbd029..1269d1083e65 100644
--- a/python/ray/serve/tests/unit/test_deployment_state.py
+++ b/python/ray/serve/tests/unit/test_deployment_state.py
@@ -7,10 +7,6 @@
 
 from ray._common.ray_constants import DEFAULT_MAX_CONCURRENCY_ASYNC
 from ray._raylet import NodeID
-from ray.serve._private import (
-    deployment_state as deployment_state_module,
-    replica as replica_module,
-)
 from ray.serve._private.autoscaling_state import AutoscalingStateManager
 from ray.serve._private.common import (
     RUNNING_REQUESTS_KEY,
@@ -55,15 +51,9 @@
     DeploymentVersion,
     ReplicaStartupStatus,
     ReplicaStateContainer,
-    _get_replica_lifecycle_metric_tag_keys,
-    _get_replica_lifecycle_metric_tags,
 )
 from ray.serve._private.exceptions import DeploymentIsBeingDeletedError
 from ray.serve._private.long_poll import LongPollNamespace
-from ray.serve._private.replica import (
-    _get_replica_metric_default_tag_keys,
-    _get_replica_metric_default_tags,
-)
 from ray.serve._private.test_utils import (
     MockDeploymentActorWrapper,
     MockPlacementGroup,
@@ -83,77 +73,6 @@
 TEST_DEPLOYMENT_ID_2 = DeploymentID(name="test_deployment_2", app_name="test_app")
 
 
-def test_replica_lifecycle_metric_tags_include_high_cardinality_by_default(
-    monkeypatch,
-):
-    monkeypatch.setattr(
-        deployment_state_module,
-        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
-        True,
-    )
-
-    assert _get_replica_lifecycle_metric_tag_keys() == (
-        "deployment",
-        "replica",
-        "application",
-    )
-    assert _get_replica_lifecycle_metric_tags("replica-id") == {"replica": "replica-id"}
-
-
-def test_replica_lifecycle_metric_tags_exclude_high_cardinality(monkeypatch):
-    monkeypatch.setattr(
-        deployment_state_module,
-        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
-        False,
-    )
-
-    assert _get_replica_lifecycle_metric_tag_keys() == (
-        "deployment",
-        "application",
-    )
-    assert _get_replica_lifecycle_metric_tags("replica-id") is None
-
-
-def test_replica_metric_default_tags_include_high_cardinality_by_default(
-    monkeypatch,
-):
-    monkeypatch.setattr(
-        replica_module,
-        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
-        True,
-    )
-
-    assert _get_replica_metric_default_tag_keys(("exception_name",)) == (
-        "deployment",
-        "replica",
-        "application",
-        "exception_name",
-    )
-    assert _get_replica_metric_default_tags(TEST_DEPLOYMENT_ID, "replica-id") == {
-        "deployment": "test_deployment",
-        "application": "test_app",
-        "replica": "replica-id",
-    }
-
-
-def test_replica_metric_default_tags_exclude_high_cardinality(monkeypatch):
-    monkeypatch.setattr(
-        replica_module,
-        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS",
-        False,
-    )
-
-    assert _get_replica_metric_default_tag_keys(("exception_name",)) == (
-        "deployment",
-        "application",
-        "exception_name",
-    )
-    assert _get_replica_metric_default_tags(TEST_DEPLOYMENT_ID, "replica-id") == {
-        "deployment": "test_deployment",
-        "application": "test_app",
-    }
-
-
 def deployment_info(
     version: Optional[str] = None,
     num_replicas: Optional[int] = 1,

From 1d168003be39086f7612b931dc54de8cea050df0 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 9 Jun 2026 16:13:36 +0000
Subject: [PATCH 07/23] Addressed reviewers comments: restructured tests,
 inlined tags

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/constants.py  |   5 +-
 python/ray/serve/_private/controller.py |  66 +++++------
 python/ray/serve/tests/BUILD.bazel      |  26 ++++
 python/ray/serve/tests/conftest.py      |  41 ++-----
 python/ray/serve/tests/test_metrics.py  | 151 ++++++++++++------------
 5 files changed, 146 insertions(+), 143 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 974a2f8c4c6d..2c03c5ad7b1d 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -1003,8 +1003,11 @@
 # Feature flag to include high-cardinality source tags on Serve controller metrics.
 # Disable this to keep deployment/application tags while dropping source identifiers
 # like replica IDs and handle IDs from controller-emitted metrics.
+RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR = (
+    "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS"
+)
 RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS = get_env_bool(
-    "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS", "1"
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR, "1"
 )
 
 # Feature flag to use compact (low-cardinality) namespace tags on long poll metrics.
diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index 4e1f1e37598b..df6a12eb86e5 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -115,32 +115,6 @@
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
-_AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS = ("deployment", "application")
-
-
-def _get_autoscaling_metrics_delay_tag_keys(
-    high_cardinality_tag_key: str,
-) -> Tuple[str, ...]:
-    """Return tag keys for controller autoscaling metrics delay gauges."""
-    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        return _AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS + (high_cardinality_tag_key,)
-    return _AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS
-
-
-def _get_autoscaling_metrics_delay_tags(
-    deployment_id: DeploymentID,
-    high_cardinality_tag_key: str,
-    high_cardinality_tag_value: str,
-) -> Dict[str, str]:
-    """Return tags for controller autoscaling metrics delay gauge updates."""
-    tags = {
-        "deployment": deployment_id.name,
-        "application": deployment_id.app_name,
-    }
-    if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-        tags[high_cardinality_tag_key] = high_cardinality_tag_value
-    return tags
-
 
 # Used for testing purposes only. If this is set, the controller will crash
 # after writing each checkpoint with the specified probability.
@@ -391,14 +365,17 @@ def record_autoscaling_metrics_from_replica(
             replica_metric_report = decompress_metric_report(replica_metric_report)
         latency = time.time() - replica_metric_report.timestamp
         latency_ms = latency * 1000
+        tags = {
+            "deployment": replica_metric_report.replica_id.deployment_id.name,
+            "application": replica_metric_report.replica_id.deployment_id.app_name,
+        }
+        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            tags["replica"] = replica_metric_report.replica_id.unique_id
+
         # Record the metrics delay for observability
         self.replica_metrics_delay_gauge.set(
             latency_ms,
-            tags=_get_autoscaling_metrics_delay_tags(
-                replica_metric_report.replica_id.deployment_id,
-                "replica",
-                replica_metric_report.replica_id.unique_id,
-            ),
+            tags=tags,
         )
         # Track in health metrics
         self._health_metrics_tracker.record_replica_metrics_delay(latency_ms)
@@ -413,14 +390,17 @@ def record_autoscaling_metrics_from_handle(
             handle_metric_report = decompress_metric_report(handle_metric_report)
         latency = time.time() - handle_metric_report.timestamp
         latency_ms = latency * 1000
+        tags = {
+            "deployment": handle_metric_report.deployment_id.name,
+            "application": handle_metric_report.deployment_id.app_name,
+        }
+        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            tags["handle"] = handle_metric_report.handle_id
+
         # Record the metrics delay for observability
         self.handle_metrics_delay_gauge.set(
             latency_ms,
-            tags=_get_autoscaling_metrics_delay_tags(
-                handle_metric_report.deployment_id,
-                "handle",
-                handle_metric_report.handle_id,
-            ),
+            tags=tags,
         )
         # Track in health metrics
         self._health_metrics_tracker.record_handle_metrics_delay(latency_ms)
@@ -787,7 +767,11 @@ def _create_control_loop_metrics(self):
                 "Time taken for the replica metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=_get_autoscaling_metrics_delay_tag_keys("replica"),
+            tag_keys=(
+                ("deployment", "application", "replica")
+                if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS
+                else ("deployment", "application")
+            ),
         )
         self.handle_metrics_delay_gauge = metrics.Gauge(
             "serve_autoscaling_handle_metrics_delay_ms",
@@ -795,7 +779,11 @@ def _create_control_loop_metrics(self):
                 "Time taken for the handle metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=_get_autoscaling_metrics_delay_tag_keys("handle"),
+            tag_keys=(
+                ("deployment", "application", "handle")
+                if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS
+                else ("deployment", "application")
+            ),
         )
         self.async_inference_task_queue_metrics_delay_gauge = metrics.Gauge(
             "serve_autoscaling_async_inference_task_queue_metrics_delay_ms",
@@ -803,7 +791,7 @@ def _create_control_loop_metrics(self):
                 "Time taken for the async inference task queue metrics to be reported "
                 "to the controller. High values may indicate a busy controller."
             ),
-            tag_keys=_AUTOSCALING_METRICS_DELAY_BASE_TAG_KEYS,
+            tag_keys=("deployment", "application"),
         )
 
     def _recover_state_from_checkpoint(self):
diff --git a/python/ray/serve/tests/BUILD.bazel b/python/ray/serve/tests/BUILD.bazel
index 3a5ac195b2d7..3041b3f31927 100644
--- a/python/ray/serve/tests/BUILD.bazel
+++ b/python/ray/serve/tests/BUILD.bazel
@@ -346,6 +346,32 @@ py_test_module_list(
     ],
 )
 
+# Run the controller metric high-cardinality opt-out test with the flag disabled.
+py_test_module_list(
+    size = "medium",
+    args = [
+        "-k",
+        "test_disable_high_cardinality_controller_metrics",
+    ],
+    env = {
+        "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS": "0",
+    },
+    files = [
+        "test_metrics.py",
+    ],
+    name_suffix = "_controller_metrics_without_high_cardinality",
+    tags = [
+        "exclusive",
+        "team:serve",
+        "use_all_core_windows",
+    ],
+    deps = [
+        ":common",
+        ":conftest",
+        "//python/ray/serve:serve_lib",
+    ],
+)
+
 # Deployment actors tests: split into own target with size=large because
 # the test file has 39 tests including heavyweight crash-recovery tests
 # with 120s waits that exceed the medium/long (900s) timeout under CI load.
diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py
index 30a469e14c48..04647c73238d 100644
--- a/python/ray/serve/tests/conftest.py
+++ b/python/ray/serve/tests/conftest.py
@@ -361,28 +361,19 @@ def ready():
 
 @pytest.fixture
 def metrics_start_shutdown(request):
-    """Fixture provides a fresh Ray cluster to prevent metrics state sharing."""
     param = request.param if hasattr(request, "param") else None
-    if isinstance(param, dict):
-        request_timeout_s = param.get("request_timeout_s")
-        env_vars = param.get("env_vars", {})
-    else:
-        request_timeout_s = param if param else None
-        env_vars = {}
-    old_env_vars = {key: os.environ.get(key) for key in env_vars}
-    for key, value in env_vars.items():
-        os.environ[key] = str(value)
+    request_timeout_s = param if param else None
+    """Fixture provides a fresh Ray cluster to prevent metrics state sharing."""
+    wait_for_metrics_port_free()
+    ray.init(
+        _metrics_export_port=TEST_METRICS_EXPORT_PORT,
+        _system_config={
+            "metrics_report_interval_ms": 100,
+            "task_retry_delay_ms": 50,
+        },
+    )
 
     try:
-        wait_for_metrics_port_free()
-        ray.init(
-            _metrics_export_port=TEST_METRICS_EXPORT_PORT,
-            _system_config={
-                "metrics_report_interval_ms": 100,
-                "task_retry_delay_ms": 50,
-            },
-        )
-
         session_name = ray._private.worker._global_node.session_name
         wait_for_metrics_endpoint(session_name)
 
@@ -403,15 +394,9 @@ def metrics_start_shutdown(request):
             ),
         )
     finally:
-        if ray.is_initialized():
-            serve.shutdown()
-            ray.shutdown()
-            reset_ray_address()
-        for key, old_value in old_env_vars.items():
-            if old_value is None:
-                os.environ.pop(key, None)
-            else:
-                os.environ[key] = old_value
+        serve.shutdown()
+        ray.shutdown()
+        reset_ray_address()
 
 
 # Helper function to return the node ID of a remote worker.
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index d3f76fd43acc..c461c7b1b953 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -25,18 +25,10 @@
     fetch_prometheus_metric_timeseries,
     wait_for_condition,
 )
-from ray.serve._private.common import (
-    DeploymentHandleSource,
-    DeploymentID,
-    HandleMetricReport,
-    ReplicaID,
-    ReplicaMetricReport,
-    TimeStampedValue,
-)
 from ray.serve._private.constants import (
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
-    SERVE_CONTROLLER_NAME,
-    SERVE_NAMESPACE,
 )
 from ray.serve._private.test_utils import (
     PROMETHEUS_METRICS_TIMEOUT_S,
@@ -53,7 +45,7 @@
 from ray.serve.generated import serve_pb2, serve_pb2_grpc
 
 CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR = (
-    "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS"
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR
 )
 
 
@@ -1282,39 +1274,39 @@ def metrics_available():
     ), f"Latency metrics should use route patterns. Found: {latency_routes}"
 
 
-@pytest.mark.parametrize(
-    "metrics_start_shutdown, include_high_cardinality",
-    [
-        (
-            {"env_vars": {CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR: "1"}},
-            True,
-        ),
-        (
-            {"env_vars": {CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR: "0"}},
-            False,
-        ),
-    ],
-    ids=["include_high_cardinality", "exclude_high_cardinality"],
-    indirect=["metrics_start_shutdown"],
-)
-def test_controller_high_cardinality_metric_tags(
-    metrics_start_shutdown, include_high_cardinality
-):
+def _check_controller_high_cardinality_metric_tags(include_high_cardinality: bool):
     """Test controller metrics respect high-cardinality tag config."""
+    signal = SignalActor.remote()
 
     @serve.deployment(
         name="autoscaling_metrics_model",
-        health_check_period_s=0.1,
-        health_check_timeout_s=1,
         autoscaling_config={
             "min_replicas": 1,
-            "max_replicas": 2,
-            "target_ongoing_requests": 1,
+            "max_replicas": 5,
+            "target_ongoing_requests": 2,
             "metrics_interval_s": 0.1,
+            "upscale_delay_s": 0,
+            "downscale_delay_s": 5,
             "look_back_period_s": 1,
         },
+        max_ongoing_requests=10,
+        graceful_shutdown_timeout_s=0.1,
     )
-    class Model:
+    class AutoscalingModel:
+        async def __call__(self):
+            await signal.wait.remote()
+            return "hello"
+
+        async def record_autoscaling_stats(self):
+            return {"custom_metric": 1}
+
+    @serve.deployment(
+        name="lifecycle_metrics_model",
+        health_check_period_s=0.1,
+        health_check_timeout_s=1,
+        graceful_shutdown_timeout_s=0.1,
+    )
+    class LifecycleModel:
         def __init__(self):
             self.should_fail_health_check = False
 
@@ -1328,54 +1320,38 @@ async def check_health(self):
             if self.should_fail_health_check:
                 raise RuntimeError("Intentional health check failure.")
 
-    app_name = "autoscaling_metrics_app"
-    deployment_name = "autoscaling_metrics_model"
-    deployment_id = DeploymentID(deployment_name, app_name)
-    replica_id = ReplicaID("test-replica-id", deployment_id)
-    serve.run(Model.bind(), name=app_name)
+    autoscaling_app_name = "autoscaling_metrics_app"
+    autoscaling_deployment_name = "autoscaling_metrics_model"
+    lifecycle_app_name = "lifecycle_metrics_app"
+    lifecycle_deployment_name = "lifecycle_metrics_model"
+    serve.run(
+        AutoscalingModel.bind(),
+        name=autoscaling_app_name,
+        route_prefix="/autoscaling",
+    )
+    serve.run(
+        LifecycleModel.bind(),
+        name=lifecycle_app_name,
+        route_prefix="/lifecycle",
+    )
 
-    url = get_application_url("HTTP", app_name)
-    assert httpx.get(url).text == "hello"
+    url = get_application_url("HTTP", lifecycle_app_name)
     assert httpx.request("GET", url, content=b"fail_health_check").text == "hello"
-
-    controller = ray.get_actor(SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE)
-    now = time.time()
-    ray.get(
-        [
-            controller.record_autoscaling_metrics_from_handle.remote(
-                HandleMetricReport(
-                    deployment_id=deployment_id,
-                    handle_id="test-handle-id",
-                    actor_id="test-actor-id",
-                    handle_source=DeploymentHandleSource.UNKNOWN,
-                    aggregated_queued_requests=0,
-                    queued_requests=[TimeStampedValue(now, 0)],
-                    aggregated_metrics={},
-                    metrics={},
-                    timestamp=now,
-                )
-            ),
-            controller.record_autoscaling_metrics_from_replica.remote(
-                ReplicaMetricReport(
-                    replica_id=replica_id,
-                    aggregated_metrics={},
-                    metrics={},
-                    timestamp=now,
-                )
-            ),
-        ]
+    handle = serve.get_deployment_handle(
+        autoscaling_deployment_name, autoscaling_app_name
     )
+    [handle.remote() for _ in range(10)]
 
     timeseries = PrometheusTimeseries()
 
-    def get_matching_metrics(metric_name: str):
+    def get_matching_metrics(metric_name: str, deployment: str, application: str):
         return [
             metric
             for metric in get_metric_dictionaries(
                 metric_name, timeseries=timeseries, wait=False
             )
-            if metric.get("deployment") == deployment_name
-            and metric.get("application") == app_name
+            if metric.get("deployment") == deployment
+            and metric.get("application") == application
         ]
 
     def assert_high_cardinality_tag(metric, tag):
@@ -1383,13 +1359,19 @@ def assert_high_cardinality_tag(metric, tag):
 
     def check_controller_metric_tags():
         health_failure_metrics = get_matching_metrics(
-            "ray_serve_health_check_failures_total"
+            "ray_serve_health_check_failures_total",
+            lifecycle_deployment_name,
+            lifecycle_app_name,
         )
         handle_metrics = get_matching_metrics(
-            "ray_serve_autoscaling_handle_metrics_delay_ms"
+            "ray_serve_autoscaling_handle_metrics_delay_ms",
+            autoscaling_deployment_name,
+            autoscaling_app_name,
         )
         replica_metrics = get_matching_metrics(
-            "ray_serve_autoscaling_replica_metrics_delay_ms"
+            "ray_serve_autoscaling_replica_metrics_delay_ms",
+            autoscaling_deployment_name,
+            autoscaling_app_name,
         )
         if not health_failure_metrics or not handle_metrics or not replica_metrics:
             return False
@@ -1403,7 +1385,26 @@ def check_controller_metric_tags():
 
         return True
 
-    wait_for_condition(check_controller_metric_tags, timeout=60)
+    try:
+        wait_for_condition(check_controller_metric_tags, timeout=60)
+    finally:
+        ray.get(signal.send.remote())
+
+
+@pytest.mark.skipif(
+    not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
+    reason=f"{CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR}=0",
+)
+def test_controller_high_cardinality_metric_tags(metrics_start_shutdown):
+    _check_controller_high_cardinality_metric_tags(include_high_cardinality=True)
+
+
+@pytest.mark.skipif(
+    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
+    reason=f"{CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR}=1",
+)
+def test_disable_high_cardinality_controller_metrics(metrics_start_shutdown):
+    _check_controller_high_cardinality_metric_tags(include_high_cardinality=False)
 
 
 def test_routing_stats_delay_metric(metrics_start_shutdown):

From 17d680582d19b823e746f3e4d2f28e62eb8c8032 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 9 Jun 2026 16:40:18 +0000
Subject: [PATCH 08/23] Addressed cursor comment: make sure shared health gauge
 does not show healthy when a replica is not

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/deployment_state.py | 13 ++++
 python/ray/serve/tests/test_metrics.py        | 70 ++++++++++++++++---
 2 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index 144dc1c99921..bdb42d7b59d4 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -4580,6 +4580,19 @@ def check_and_update_replicas(self):
         if self._in_transition:
             self._check_and_update_transitioning_replicas()
 
+        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS and (
+            healthy_replicas or unhealthy_replicas
+        ):
+            # With the replica tag disabled, all replica writes collapse to a
+            # single deployment/application series. Overwrite it once with
+            # deployment-level health so iteration order cannot report healthy
+            # while a replica is failing.
+            deployment_is_healthy = (
+                len(unhealthy_replicas) == 0
+                and self._curr_status_info.status == DeploymentStatus.HEALTHY
+            )
+            self.health_check_gauge.set(int(deployment_is_healthy))
+
         # After replica state updates, check rank consistency and perform minimal reassignment if needed
         # This ensures ranks are continuous after lifecycle events
         # Only do consistency check when deployment is stable (not during active updates)
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index c461c7b1b953..d9f2f4698fec 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -1276,7 +1276,20 @@ def metrics_available():
 
 def _check_controller_high_cardinality_metric_tags(include_high_cardinality: bool):
     """Test controller metrics respect high-cardinality tag config."""
+
+    @ray.remote
+    class ReplicaHealthState:
+        def __init__(self):
+            self.failing_replica_ids = set()
+
+        def add_failing_replica(self, replica_id: str):
+            self.failing_replica_ids.add(replica_id)
+
+        def should_fail_health_check(self, replica_id: str) -> bool:
+            return replica_id in self.failing_replica_ids
+
     signal = SignalActor.remote()
+    replica_health_state = ReplicaHealthState.remote()
 
     @serve.deployment(
         name="autoscaling_metrics_model",
@@ -1302,22 +1315,21 @@ async def record_autoscaling_stats(self):
 
     @serve.deployment(
         name="lifecycle_metrics_model",
+        num_replicas=2,
         health_check_period_s=0.1,
         health_check_timeout_s=1,
         graceful_shutdown_timeout_s=0.1,
     )
     class LifecycleModel:
-        def __init__(self):
-            self.should_fail_health_check = False
-
-        async def __call__(self, request: Request):
-            body = await request.body()
-            if body == b"fail_health_check":
-                self.should_fail_health_check = True
-            return "hello"
+        async def __call__(self):
+            return serve.get_replica_context().replica_tag
 
         async def check_health(self):
-            if self.should_fail_health_check:
+            replica_id = serve.get_replica_context().replica_tag
+            should_fail_health_check = (
+                await replica_health_state.should_fail_health_check.remote(replica_id)
+            )
+            if should_fail_health_check:
                 raise RuntimeError("Intentional health check failure.")
 
     autoscaling_app_name = "autoscaling_metrics_app"
@@ -1336,7 +1348,18 @@ async def check_health(self):
     )
 
     url = get_application_url("HTTP", lifecycle_app_name)
-    assert httpx.request("GET", url, content=b"fail_health_check").text == "hello"
+    lifecycle_replica_ids = set()
+
+    def check_lifecycle_replicas_running():
+        lifecycle_replica_ids.add(httpx.get(url).text)
+        return len(lifecycle_replica_ids) == 2
+
+    wait_for_condition(check_lifecycle_replicas_running, timeout=60)
+    ray.get(
+        replica_health_state.add_failing_replica.remote(
+            sorted(lifecycle_replica_ids)[0]
+        )
+    )
     handle = serve.get_deployment_handle(
         autoscaling_deployment_name, autoscaling_app_name
     )
@@ -1363,6 +1386,11 @@ def check_controller_metric_tags():
             lifecycle_deployment_name,
             lifecycle_app_name,
         )
+        health_status_metrics = get_matching_metrics(
+            "ray_serve_deployment_replica_healthy",
+            lifecycle_deployment_name,
+            lifecycle_app_name,
+        )
         handle_metrics = get_matching_metrics(
             "ray_serve_autoscaling_handle_metrics_delay_ms",
             autoscaling_deployment_name,
@@ -1373,16 +1401,36 @@ def check_controller_metric_tags():
             autoscaling_deployment_name,
             autoscaling_app_name,
         )
-        if not health_failure_metrics or not handle_metrics or not replica_metrics:
+        if (
+            not health_failure_metrics
+            or not health_status_metrics
+            or not handle_metrics
+            or not replica_metrics
+        ):
             return False
 
         for metric in health_failure_metrics:
             assert_high_cardinality_tag(metric, "replica")
+        for metric in health_status_metrics:
+            assert_high_cardinality_tag(metric, "replica")
         for metric in handle_metrics:
             assert_high_cardinality_tag(metric, "handle")
         for metric in replica_metrics:
             assert_high_cardinality_tag(metric, "replica")
 
+        if not include_high_cardinality:
+            health_status_value = get_metric_float(
+                "ray_serve_deployment_replica_healthy",
+                {
+                    "deployment": lifecycle_deployment_name,
+                    "application": lifecycle_app_name,
+                },
+                timeseries=timeseries,
+                timeout=PROMETHEUS_METRICS_TIMEOUT_S,
+            )
+            if health_status_value != 0:
+                return False
+
         return True
 
     try:

From 6268e0cff33e5dcf48b758c33d4568806b8b37c6 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 9 Jun 2026 19:55:40 +0000
Subject: [PATCH 09/23] Improved readability and improved comment

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/deployment_state.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index bdb42d7b59d4..937a6f7e08bc 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -4583,15 +4583,15 @@ def check_and_update_replicas(self):
         if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS and (
             healthy_replicas or unhealthy_replicas
         ):
-            # With the replica tag disabled, all replica writes collapse to a
-            # single deployment/application series. Overwrite it once with
-            # deployment-level health so iteration order cannot report healthy
-            # while a replica is failing.
+            # _set_health_gauge() is still called once per replica above. When
+            # the replica tag is disabled, those writes all target the same
+            # time series, so the last replica processed would otherwise decide
+            # the value. Write the aggregate value last.
             deployment_is_healthy = (
-                len(unhealthy_replicas) == 0
+                not unhealthy_replicas
                 and self._curr_status_info.status == DeploymentStatus.HEALTHY
             )
-            self.health_check_gauge.set(int(deployment_is_healthy))
+            self.health_check_gauge.set(1 if deployment_is_healthy else 0)
 
         # After replica state updates, check rank consistency and perform minimal reassignment if needed
         # This ensures ranks are continuous after lifecycle events

From 220c99074913175e04470000de425c291f405b7a Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Wed, 10 Jun 2026 15:38:18 +0000
Subject: [PATCH 10/23] Remove unneeded gauge update, cleaned up
 RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/constants.py        | 5 +----
 python/ray/serve/_private/deployment_state.py | 9 +++------
 python/ray/serve/tests/test_metrics.py        | 9 ++-------
 3 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 2c03c5ad7b1d..974a2f8c4c6d 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -1003,11 +1003,8 @@
 # Feature flag to include high-cardinality source tags on Serve controller metrics.
 # Disable this to keep deployment/application tags while dropping source identifiers
 # like replica IDs and handle IDs from controller-emitted metrics.
-RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR = (
-    "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS"
-)
 RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS = get_env_bool(
-    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR, "1"
+    "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS", "1"
 )
 
 # Feature flag to use compact (low-cardinality) namespace tags on long poll metrics.
diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index 937a6f7e08bc..f4d5aa231805 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -4367,8 +4367,6 @@ def _set_health_gauge(self, replica_unique_id: str, value: int) -> None:
             return
         if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             self.health_check_gauge.set(value, tags={"replica": replica_unique_id})
-        else:
-            self.health_check_gauge.set(value)
         self._health_gauge_cache[replica_unique_id] = (value, now)
 
     def _register_gang_replica(self, replica_id: ReplicaID, gang_id: str) -> None:
@@ -4583,10 +4581,9 @@ def check_and_update_replicas(self):
         if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS and (
             healthy_replicas or unhealthy_replicas
         ):
-            # _set_health_gauge() is still called once per replica above. When
-            # the replica tag is disabled, those writes all target the same
-            # time series, so the last replica processed would otherwise decide
-            # the value. Write the aggregate value last.
+            # When the replica tag is disabled, this is a single
+            # deployment/application series. Emit it once with the aggregate
+            # value so per-replica iteration order cannot decide the result.
             deployment_is_healthy = (
                 not unhealthy_replicas
                 and self._curr_status_info.status == DeploymentStatus.HEALTHY
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index d9f2f4698fec..9d07c1619f4c 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -27,7 +27,6 @@
 )
 from ray.serve._private.constants import (
     RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
-    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
 )
 from ray.serve._private.test_utils import (
@@ -44,10 +43,6 @@
 from ray.serve.config import RequestRouterConfig
 from ray.serve.generated import serve_pb2, serve_pb2_grpc
 
-CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR = (
-    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS_ENV_VAR
-)
-
 
 def extract_tags(line: str) -> Dict[str, str]:
     """Extracts any tags from the metrics line."""
@@ -1441,7 +1436,7 @@ def check_controller_metric_tags():
 
 @pytest.mark.skipif(
     not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
-    reason=f"{CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR}=0",
+    reason="controller metric high-cardinality tags are disabled",
 )
 def test_controller_high_cardinality_metric_tags(metrics_start_shutdown):
     _check_controller_high_cardinality_metric_tags(include_high_cardinality=True)
@@ -1449,7 +1444,7 @@ def test_controller_high_cardinality_metric_tags(metrics_start_shutdown):
 
 @pytest.mark.skipif(
     RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
-    reason=f"{CONTROLLER_HIGH_CARDINALITY_TAGS_ENV_VAR}=1",
+    reason="controller metric high-cardinality tags are enabled",
 )
 def test_disable_high_cardinality_controller_metrics(metrics_start_shutdown):
     _check_controller_high_cardinality_metric_tags(include_high_cardinality=False)

From 6c40e9bde97dca5c7d34215dc1eba466694b1c31 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Thu, 11 Jun 2026 02:39:25 +0000
Subject: [PATCH 11/23] Addressed cursor comments

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/deployment_state.py | 13 ++--
 python/ray/serve/tests/test_metrics.py        | 70 ++++++++++++++++---
 2 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index f4d5aa231805..0337ef36a5a6 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -4578,14 +4578,19 @@ def check_and_update_replicas(self):
         if self._in_transition:
             self._check_and_update_transitioning_replicas()
 
-        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS and (
-            healthy_replicas or unhealthy_replicas
-        ):
+        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             # When the replica tag is disabled, this is a single
             # deployment/application series. Emit it once with the aggregate
             # value so per-replica iteration order cannot decide the result.
+            has_running_replica = (
+                self._replicas.count(
+                    states=[ReplicaState.RUNNING, ReplicaState.PENDING_MIGRATION]
+                )
+                > 0
+            )
             deployment_is_healthy = (
-                not unhealthy_replicas
+                has_running_replica
+                and not unhealthy_replicas
                 and self._curr_status_info.status == DeploymentStatus.HEALTHY
             )
             self.health_check_gauge.set(1 if deployment_is_healthy else 0)
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index 9d07c1619f4c..474e02ec7ae0 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -1327,10 +1327,32 @@ async def check_health(self):
             if should_fail_health_check:
                 raise RuntimeError("Intentional health check failure.")
 
+    @serve.deployment(
+        name="scale_down_metrics_model",
+        autoscaling_config={
+            "min_replicas": 0,
+            "max_replicas": 1,
+            "initial_replicas": 1,
+            "target_ongoing_requests": 1,
+            "metrics_interval_s": 0.1,
+            "look_back_period_s": 0.1,
+            "upscale_delay_s": 0,
+            "downscale_delay_s": 5,
+        },
+        max_ongoing_requests=1,
+        health_check_period_s=0.1,
+        graceful_shutdown_timeout_s=0.1,
+    )
+    class ScaleDownModel:
+        async def __call__(self):
+            return "hello"
+
     autoscaling_app_name = "autoscaling_metrics_app"
     autoscaling_deployment_name = "autoscaling_metrics_model"
     lifecycle_app_name = "lifecycle_metrics_app"
     lifecycle_deployment_name = "lifecycle_metrics_model"
+    scale_down_app_name = "scale_down_metrics_app"
+    scale_down_deployment_name = "scale_down_metrics_model"
     serve.run(
         AutoscalingModel.bind(),
         name=autoscaling_app_name,
@@ -1350,6 +1372,42 @@ def check_lifecycle_replicas_running():
         return len(lifecycle_replica_ids) == 2
 
     wait_for_condition(check_lifecycle_replicas_running, timeout=60)
+    timeseries = PrometheusTimeseries()
+
+    def get_health_status_value(deployment: str, application: str) -> float:
+        return get_metric_float(
+            "ray_serve_deployment_replica_healthy",
+            {
+                "deployment": deployment,
+                "application": application,
+            },
+            timeseries=timeseries,
+            timeout=PROMETHEUS_METRICS_TIMEOUT_S,
+        )
+
+    if not include_high_cardinality:
+        serve.run(
+            ScaleDownModel.bind(),
+            name=scale_down_app_name,
+            route_prefix="/scale-down",
+        )
+
+        wait_for_condition(
+            lambda: get_health_status_value(
+                scale_down_deployment_name, scale_down_app_name
+            )
+            == 1,
+            timeout=60,
+        )
+
+        wait_for_condition(
+            lambda: get_health_status_value(
+                scale_down_deployment_name, scale_down_app_name
+            )
+            == 0,
+            timeout=60,
+        )
+
     ray.get(
         replica_health_state.add_failing_replica.remote(
             sorted(lifecycle_replica_ids)[0]
@@ -1360,8 +1418,6 @@ def check_lifecycle_replicas_running():
     )
     [handle.remote() for _ in range(10)]
 
-    timeseries = PrometheusTimeseries()
-
     def get_matching_metrics(metric_name: str, deployment: str, application: str):
         return [
             metric
@@ -1414,14 +1470,8 @@ def check_controller_metric_tags():
             assert_high_cardinality_tag(metric, "replica")
 
         if not include_high_cardinality:
-            health_status_value = get_metric_float(
-                "ray_serve_deployment_replica_healthy",
-                {
-                    "deployment": lifecycle_deployment_name,
-                    "application": lifecycle_app_name,
-                },
-                timeseries=timeseries,
-                timeout=PROMETHEUS_METRICS_TIMEOUT_S,
+            health_status_value = get_health_status_value(
+                lifecycle_deployment_name, lifecycle_app_name
             )
             if health_status_value != 0:
                 return False

From 1fa0edd002ae879d2689a3927a34a168b9f9d4f1 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Thu, 11 Jun 2026 14:40:23 +0000
Subject: [PATCH 12/23] Fix controller metrics cardinality test

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/tests/test_metrics.py | 59 ++++----------------------
 1 file changed, 8 insertions(+), 51 deletions(-)

diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index 474e02ec7ae0..871407481903 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -1310,7 +1310,7 @@ async def record_autoscaling_stats(self):
 
     @serve.deployment(
         name="lifecycle_metrics_model",
-        num_replicas=2,
+        num_replicas=1,
         health_check_period_s=0.1,
         health_check_timeout_s=1,
         graceful_shutdown_timeout_s=0.1,
@@ -1327,32 +1327,10 @@ async def check_health(self):
             if should_fail_health_check:
                 raise RuntimeError("Intentional health check failure.")
 
-    @serve.deployment(
-        name="scale_down_metrics_model",
-        autoscaling_config={
-            "min_replicas": 0,
-            "max_replicas": 1,
-            "initial_replicas": 1,
-            "target_ongoing_requests": 1,
-            "metrics_interval_s": 0.1,
-            "look_back_period_s": 0.1,
-            "upscale_delay_s": 0,
-            "downscale_delay_s": 5,
-        },
-        max_ongoing_requests=1,
-        health_check_period_s=0.1,
-        graceful_shutdown_timeout_s=0.1,
-    )
-    class ScaleDownModel:
-        async def __call__(self):
-            return "hello"
-
     autoscaling_app_name = "autoscaling_metrics_app"
     autoscaling_deployment_name = "autoscaling_metrics_model"
     lifecycle_app_name = "lifecycle_metrics_app"
     lifecycle_deployment_name = "lifecycle_metrics_model"
-    scale_down_app_name = "scale_down_metrics_app"
-    scale_down_deployment_name = "scale_down_metrics_model"
     serve.run(
         AutoscalingModel.bind(),
         name=autoscaling_app_name,
@@ -1367,11 +1345,13 @@ async def __call__(self):
     url = get_application_url("HTTP", lifecycle_app_name)
     lifecycle_replica_ids = set()
 
-    def check_lifecycle_replicas_running():
-        lifecycle_replica_ids.add(httpx.get(url).text)
-        return len(lifecycle_replica_ids) == 2
+    def check_lifecycle_replica_running():
+        response = httpx.get(url)
+        response.raise_for_status()
+        lifecycle_replica_ids.add(response.text)
+        return len(lifecycle_replica_ids) == 1
 
-    wait_for_condition(check_lifecycle_replicas_running, timeout=60)
+    wait_for_condition(check_lifecycle_replica_running, timeout=60)
     timeseries = PrometheusTimeseries()
 
     def get_health_status_value(deployment: str, application: str) -> float:
@@ -1385,29 +1365,6 @@ def get_health_status_value(deployment: str, application: str) -> float:
             timeout=PROMETHEUS_METRICS_TIMEOUT_S,
         )
 
-    if not include_high_cardinality:
-        serve.run(
-            ScaleDownModel.bind(),
-            name=scale_down_app_name,
-            route_prefix="/scale-down",
-        )
-
-        wait_for_condition(
-            lambda: get_health_status_value(
-                scale_down_deployment_name, scale_down_app_name
-            )
-            == 1,
-            timeout=60,
-        )
-
-        wait_for_condition(
-            lambda: get_health_status_value(
-                scale_down_deployment_name, scale_down_app_name
-            )
-            == 0,
-            timeout=60,
-        )
-
     ray.get(
         replica_health_state.add_failing_replica.remote(
             sorted(lifecycle_replica_ids)[0]
@@ -1945,4 +1902,4 @@ def check_objref_resolution_metric_value():
 
 
 if __name__ == "__main__":
-    sys.exit(pytest.main(["-v", "-s", __file__]))
+    sys.exit(pytest.main(["-v", "-s"] + sys.argv[1:] + [__file__]))

From 57cb17d85d047ca97966668e7c45bfcd71248dd8 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Thu, 11 Jun 2026 20:27:57 +0000
Subject: [PATCH 13/23] Emit healthy replica count without replica metric tag

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 doc/source/serve/monitoring.md                |  4 +-
 python/ray/serve/_private/deployment_state.py | 30 ++++------
 python/ray/serve/tests/test_metrics.py        | 60 ++++++++++---------
 3 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/doc/source/serve/monitoring.md b/doc/source/serve/monitoring.md
index defa0a2c7dbd..c3f0313a0e72 100644
--- a/doc/source/serve/monitoring.md
+++ b/doc/source/serve/monitoring.md
@@ -749,13 +749,13 @@ This setting doesn't affect replica-emitted metrics such as
 
 | Metric | Type | Tags | Description |
 |--------|------|------|-------------|
-| `ray_serve_deployment_replica_healthy` | Gauge | `deployment`, `replica`, `application` | Health status of the replica: `1` = healthy, `0` = unhealthy. |
+| `ray_serve_deployment_replica_healthy` | Gauge | `deployment`, `application`, `replica` by default; `deployment`, `application` when `RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` | Tracks healthy replicas. With `replica`, each replica series is `1` for healthy and `0` for unhealthy. Without `replica`, the deployment/application series is the healthy replica count. |
 | `ray_serve_deployment_replica_starts_total` | Counter | `deployment`, `replica`, `application` | Total number of times the replica has started (including restarts due to failure). |
 | `ray_serve_replica_startup_latency_ms` | Histogram | `deployment`, `application` | Total time from replica creation to ready state in milliseconds. Includes node provisioning (if needed on VM or Kubernetes), runtime environment bootstrap (pip install, Docker image pull, etc.), Ray actor scheduling, and actor constructor execution. Useful for debugging slow cold starts. |
 | `ray_serve_replica_initialization_latency_ms` | Histogram | `deployment`, `application` | Time for the actor constructor to run in milliseconds. This is a subset of `ray_serve_replica_startup_latency_ms`. |
 | `ray_serve_replica_reconfigure_latency_ms` | Histogram | `deployment`, `application` | Time in milliseconds for a replica to complete reconfiguration. Includes both reconfigure time and one control-loop iteration, so very low values may be unreliable. |
 | `ray_serve_health_check_latency_ms` | Histogram | `deployment`, `application` | Duration of health check calls in milliseconds. Useful for identifying slow health checks blocking scaling. |
-| `ray_serve_health_check_failures_total` | Counter | `deployment`, `replica`, `application` | Total number of failed health checks. Provides early warning before replica is marked unhealthy. |
+| `ray_serve_health_check_failures_total` | Counter | `deployment`, `application`, `replica` by default; `deployment`, `application` when `RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` | Total number of failed health checks. Provides early warning before replica is marked unhealthy. |
 | `ray_serve_replica_shutdown_duration_ms` | Histogram | `deployment`, `application` | Time from shutdown signal to replica fully stopped in milliseconds. Useful for debugging slow draining during scale-down or rolling updates. |
 
 ### Autoscaling metrics
diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index 0337ef36a5a6..5d6a1411d585 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -2886,8 +2886,9 @@ def __init__(
         self.health_check_gauge = metrics.Gauge(
             "serve_deployment_replica_healthy",
             description=(
-                "Tracks whether this deployment replica is healthy. 1 means "
-                "healthy, 0 means unhealthy."
+                "Tracks healthy replicas. When source tags are enabled, each "
+                "replica series is 1 for healthy and 0 for unhealthy; otherwise, "
+                "the deployment/application series is the healthy replica count."
             ),
             tag_keys=replica_lifecycle_metric_tag_keys,
         )
@@ -4357,6 +4358,9 @@ def _set_health_gauge(self, replica_unique_id: str, value: int) -> None:
         every control-loop iteration while still refreshing the metric often
         enough for Prometheus export.
         """
+        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            return
+
         now = time.time()
         cached = self._health_gauge_cache.get(replica_unique_id)
         if (
@@ -4365,8 +4369,7 @@ def _set_health_gauge(self, replica_unique_id: str, value: int) -> None:
             and (now - cached[1]) < RAY_SERVE_STATUS_GAUGE_REPORT_INTERVAL_S
         ):
             return
-        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-            self.health_check_gauge.set(value, tags={"replica": replica_unique_id})
+        self.health_check_gauge.set(value, tags={"replica": replica_unique_id})
         self._health_gauge_cache[replica_unique_id] = (value, now)
 
     def _register_gang_replica(self, replica_id: ReplicaID, gang_id: str) -> None:
@@ -4580,20 +4583,13 @@ def check_and_update_replicas(self):
 
         if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             # When the replica tag is disabled, this is a single
-            # deployment/application series. Emit it once with the aggregate
-            # value so per-replica iteration order cannot decide the result.
-            has_running_replica = (
-                self._replicas.count(
-                    states=[ReplicaState.RUNNING, ReplicaState.PENDING_MIGRATION]
-                )
-                > 0
-            )
-            deployment_is_healthy = (
-                has_running_replica
-                and not unhealthy_replicas
-                and self._curr_status_info.status == DeploymentStatus.HEALTHY
+            # deployment/application series. Emit the aggregate count of
+            # healthy replicas so per-replica iteration order cannot decide
+            # the result.
+            healthy_replica_count = self._replicas.count(
+                states=[ReplicaState.RUNNING, ReplicaState.PENDING_MIGRATION]
             )
-            self.health_check_gauge.set(1 if deployment_is_healthy else 0)
+            self.health_check_gauge.set(healthy_replica_count)
 
         # After replica state updates, check rank consistency and perform minimal reassignment if needed
         # This ensures ranks are continuous after lifecycle events
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index 871407481903..6f96330521f9 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -1275,13 +1275,23 @@ def _check_controller_high_cardinality_metric_tags(include_high_cardinality: boo
     @ray.remote
     class ReplicaHealthState:
         def __init__(self):
-            self.failing_replica_ids = set()
+            self.replica_ids = set()
+            self.failures_enabled = False
+            self.failing_replica_id = None
 
-        def add_failing_replica(self, replica_id: str):
-            self.failing_replica_ids.add(replica_id)
+        def get_num_registered_replicas(self) -> int:
+            return len(self.replica_ids)
 
-        def should_fail_health_check(self, replica_id: str) -> bool:
-            return replica_id in self.failing_replica_ids
+        def enable_failures(self):
+            self.failures_enabled = True
+
+        def register_and_should_fail_health_check(self, replica_id: str) -> bool:
+            self.replica_ids.add(replica_id)
+            if not self.failures_enabled:
+                return False
+            if self.failing_replica_id is None:
+                self.failing_replica_id = replica_id
+            return replica_id == self.failing_replica_id
 
     signal = SignalActor.remote()
     replica_health_state = ReplicaHealthState.remote()
@@ -1310,7 +1320,7 @@ async def record_autoscaling_stats(self):
 
     @serve.deployment(
         name="lifecycle_metrics_model",
-        num_replicas=1,
+        num_replicas=2,
         health_check_period_s=0.1,
         health_check_timeout_s=1,
         graceful_shutdown_timeout_s=0.1,
@@ -1322,7 +1332,9 @@ async def __call__(self):
         async def check_health(self):
             replica_id = serve.get_replica_context().replica_tag
             should_fail_health_check = (
-                await replica_health_state.should_fail_health_check.remote(replica_id)
+                await replica_health_state.register_and_should_fail_health_check.remote(
+                    replica_id
+                )
             )
             if should_fail_health_check:
                 raise RuntimeError("Intentional health check failure.")
@@ -1342,16 +1354,10 @@ async def check_health(self):
         route_prefix="/lifecycle",
     )
 
-    url = get_application_url("HTTP", lifecycle_app_name)
-    lifecycle_replica_ids = set()
-
-    def check_lifecycle_replica_running():
-        response = httpx.get(url)
-        response.raise_for_status()
-        lifecycle_replica_ids.add(response.text)
-        return len(lifecycle_replica_ids) == 1
-
-    wait_for_condition(check_lifecycle_replica_running, timeout=60)
+    wait_for_condition(
+        lambda: ray.get(replica_health_state.get_num_registered_replicas.remote()) == 2,
+        timeout=60,
+    )
     timeseries = PrometheusTimeseries()
 
     def get_health_status_value(deployment: str, application: str) -> float:
@@ -1365,11 +1371,16 @@ def get_health_status_value(deployment: str, application: str) -> float:
             timeout=PROMETHEUS_METRICS_TIMEOUT_S,
         )
 
-    ray.get(
-        replica_health_state.add_failing_replica.remote(
-            sorted(lifecycle_replica_ids)[0]
+    if not include_high_cardinality:
+        wait_for_condition(
+            lambda: get_health_status_value(
+                lifecycle_deployment_name, lifecycle_app_name
+            )
+            == 2,
+            timeout=60,
         )
-    )
+
+    ray.get(replica_health_state.enable_failures.remote())
     handle = serve.get_deployment_handle(
         autoscaling_deployment_name, autoscaling_app_name
     )
@@ -1426,13 +1437,6 @@ def check_controller_metric_tags():
         for metric in replica_metrics:
             assert_high_cardinality_tag(metric, "replica")
 
-        if not include_high_cardinality:
-            health_status_value = get_health_status_value(
-                lifecycle_deployment_name, lifecycle_app_name
-            )
-            if health_status_value != 0:
-                return False
-
         return True
 
     try:

From f2b14035988a5509eedb1af3d1048a595dd1ac47 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Fri, 12 Jun 2026 19:03:18 +0000
Subject: [PATCH 14/23] Count health-checked replicas in aggregate gauge

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/deployment_state.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index 5d6a1411d585..383508852256 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -4583,12 +4583,10 @@ def check_and_update_replicas(self):
 
         if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             # When the replica tag is disabled, this is a single
-            # deployment/application series. Emit the aggregate count of
-            # healthy replicas so per-replica iteration order cannot decide
-            # the result.
-            healthy_replica_count = self._replicas.count(
-                states=[ReplicaState.RUNNING, ReplicaState.PENDING_MIGRATION]
-            )
+            # deployment/application series. Emit the count of replicas that
+            # passed health checks in this iteration so newly promoted replicas
+            # are not counted before their first successful health check.
+            healthy_replica_count = len(healthy_replicas)
             self.health_check_gauge.set(healthy_replica_count)
 
         # After replica state updates, check rank consistency and perform minimal reassignment if needed

From 5102ae7a26a932e11bb1b7d3c71c58a7027b3551 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Fri, 12 Jun 2026 20:56:56 +0000
Subject: [PATCH 15/23] Aggregate low-cardinality metrics delay gauges

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/controller.py | 65 ++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index df6a12eb86e5..693a90437ed5 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -39,7 +39,10 @@
     RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
     RAY_SERVE_ENABLE_HA_PROXY,
+    RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
     RAY_SERVE_LOG_TO_STDERR,
+    RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S,
+    RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
     RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE,
     RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP,
     RAY_SERVE_RUN_USER_CODE_IN_SEPARATE_THREAD,
@@ -125,6 +128,23 @@
 SHUTDOWN_IN_PROGRESS_KEY = "serve-shutdown-in-progress"
 
 
+def _get_aggregated_autoscaling_metrics_delay_ms(
+    delay_by_source: Dict[str, Tuple[float, float]],
+    source_id: str,
+    delay_ms: float,
+    timeout_s: float,
+) -> float:
+    """Return the max latest metrics delay for a deployment/application series."""
+
+    now = time.time()
+    delay_by_source[source_id] = (delay_ms, now)
+    for cached_source_id, (_, updated_at) in list(delay_by_source.items()):
+        if now - updated_at > timeout_s:
+            del delay_by_source[cached_source_id]
+
+    return max(cached_delay_ms for cached_delay_ms, _ in delay_by_source.values())
+
+
 class ServeController:
     """Responsible for managing the state of the serving system.
 
@@ -288,6 +308,12 @@ async def __init__(
         self._health_metrics_tracker = ControllerHealthMetricsTracker(
             controller_start_time=time.time()
         )
+        self._replica_metrics_delay_ms: Dict[
+            Tuple[str, str], Dict[str, Tuple[float, float]]
+        ] = {}
+        self._handle_metrics_delay_ms: Dict[
+            Tuple[str, str], Dict[str, Tuple[float, float]]
+        ] = {}
 
         self._create_control_loop_metrics()
         run_background_task(self.run_control_loop())
@@ -365,16 +391,29 @@ def record_autoscaling_metrics_from_replica(
             replica_metric_report = decompress_metric_report(replica_metric_report)
         latency = time.time() - replica_metric_report.timestamp
         latency_ms = latency * 1000
+        deployment = replica_metric_report.replica_id.deployment_id.name
+        application = replica_metric_report.replica_id.deployment_id.app_name
         tags = {
-            "deployment": replica_metric_report.replica_id.deployment_id.name,
-            "application": replica_metric_report.replica_id.deployment_id.app_name,
+            "deployment": deployment,
+            "application": application,
         }
         if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             tags["replica"] = replica_metric_report.replica_id.unique_id
 
+        metrics_delay_ms = latency_ms
+        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            metrics_delay_ms = _get_aggregated_autoscaling_metrics_delay_ms(
+                self._replica_metrics_delay_ms.setdefault(
+                    (deployment, application), {}
+                ),
+                replica_metric_report.replica_id.unique_id,
+                latency_ms,
+                2 * RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
+            )
+
         # Record the metrics delay for observability
         self.replica_metrics_delay_gauge.set(
-            latency_ms,
+            metrics_delay_ms,
             tags=tags,
         )
         # Track in health metrics
@@ -390,16 +429,30 @@ def record_autoscaling_metrics_from_handle(
             handle_metric_report = decompress_metric_report(handle_metric_report)
         latency = time.time() - handle_metric_report.timestamp
         latency_ms = latency * 1000
+        deployment = handle_metric_report.deployment_id.name
+        application = handle_metric_report.deployment_id.app_name
         tags = {
-            "deployment": handle_metric_report.deployment_id.name,
-            "application": handle_metric_report.deployment_id.app_name,
+            "deployment": deployment,
+            "application": application,
         }
         if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             tags["handle"] = handle_metric_report.handle_id
 
+        metrics_delay_ms = latency_ms
+        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            metrics_delay_ms = _get_aggregated_autoscaling_metrics_delay_ms(
+                self._handle_metrics_delay_ms.setdefault((deployment, application), {}),
+                handle_metric_report.handle_id,
+                latency_ms,
+                max(
+                    2 * RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
+                    RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S,
+                ),
+            )
+
         # Record the metrics delay for observability
         self.handle_metrics_delay_gauge.set(
-            latency_ms,
+            metrics_delay_ms,
             tags=tags,
         )
         # Track in health metrics

From fd391e4871c8234b31a6d36be6a0c016abcc28b5 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Fri, 12 Jun 2026 21:08:24 +0000
Subject: [PATCH 16/23] Refresh aggregate health gauge on replica stop

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/deployment_state.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index 383508852256..2fdef22c22eb 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -2868,6 +2868,7 @@ def __init__(
         # changes or the cache entry is older than _HEALTH_GAUGE_REPORT_INTERVAL_S
         # (to ensure the metric is re-exported within each Prometheus scrape window).
         self._health_gauge_cache: Dict[str, Tuple[int, float]] = {}
+        self._last_health_check_healthy_replica_ids: Set[str] = set()
 
         # Maintain gang membership bookkeeping to avoid O(num_replicas) lookups when stopping gangs.
         # Updated on replica creation during upscaling and permanent removal during downscaling.
@@ -4406,7 +4407,15 @@ def _stop_replica(self, replica: DeploymentReplica, graceful_stop=True):
         replica.stop(graceful=graceful_stop)
         self._replicas.add(ReplicaState.STOPPING, replica)
         self._deployment_scheduler.on_replica_stopping(replica.replica_id)
-        self._set_health_gauge(replica.replica_id.unique_id, 0)
+        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            self._set_health_gauge(replica.replica_id.unique_id, 0)
+        else:
+            self._last_health_check_healthy_replica_ids.discard(
+                replica.replica_id.unique_id
+            )
+            self.health_check_gauge.set(
+                len(self._last_health_check_healthy_replica_ids)
+            )
 
     def _stop_replica_mark_unhealthy_if_target_version(
         self, replica: DeploymentReplica, graceful_stop: bool
@@ -4586,8 +4595,12 @@ def check_and_update_replicas(self):
             # deployment/application series. Emit the count of replicas that
             # passed health checks in this iteration so newly promoted replicas
             # are not counted before their first successful health check.
-            healthy_replica_count = len(healthy_replicas)
-            self.health_check_gauge.set(healthy_replica_count)
+            self._last_health_check_healthy_replica_ids = {
+                replica.replica_id.unique_id for replica in healthy_replicas
+            }
+            self.health_check_gauge.set(
+                len(self._last_health_check_healthy_replica_ids)
+            )
 
         # After replica state updates, check rank consistency and perform minimal reassignment if needed
         # This ensures ranks are continuous after lifecycle events

From 88fc7a3e6d4b435f7036e2a184b8e12d9ce218dc Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Sun, 14 Jun 2026 00:55:11 +0000
Subject: [PATCH 17/23] Trigger CI

Signed-off-by: john.taylor <john.taylor@anyscale.com>

From d80e9a4fc5a8471f117865900f6aa1b5656c5b11 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Mon, 15 Jun 2026 15:39:00 +0000
Subject: [PATCH 18/23] Trigger CI

Signed-off-by: john.taylor <john.taylor@anyscale.com>

From 444bc282926592435465896fdf4b454370f948d0 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Mon, 15 Jun 2026 16:42:05 +0000
Subject: [PATCH 19/23] Use per-deployment metrics_interval_s for aggregated
 delay cache TTL.

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/controller.py       | 46 +++++++++-
 .../ray/serve/tests/unit/test_controller.py   | 85 +++++++++++++++++++
 2 files changed, 127 insertions(+), 4 deletions(-)

diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index 693a90437ed5..d8c57bea3ff1 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -145,6 +145,19 @@ def _get_aggregated_autoscaling_metrics_delay_ms(
     return max(cached_delay_ms for cached_delay_ms, _ in delay_by_source.values())
 
 
+def _get_autoscaling_metrics_delay_cache_timeout_s(
+    metrics_interval_s: float,
+    *,
+    for_handle: bool = False,
+) -> float:
+    """Return cache TTL aligned with per-deployment metrics push interval."""
+
+    timeout_s = 2 * metrics_interval_s
+    if for_handle:
+        timeout_s = max(timeout_s, RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S)
+    return timeout_s
+
+
 class ServeController:
     """Responsible for managing the state of the serving system.
 
@@ -384,6 +397,24 @@ def check_alive(self) -> None:
     def get_pid(self) -> int:
         return os.getpid()
 
+    def _get_deployment_metrics_interval_s(
+        self, deployment: str, application: str, *, for_handle: bool
+    ) -> float:
+        deployment_config = self.get_deployment_config(
+            DeploymentID(name=deployment, app_name=application)
+        )
+        if (
+            deployment_config is not None
+            and deployment_config.autoscaling_config is not None
+        ):
+            return deployment_config.autoscaling_config.metrics_interval_s
+
+        return (
+            RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S
+            if for_handle
+            else RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S
+        )
+
     def record_autoscaling_metrics_from_replica(
         self, replica_metric_report: Union[ReplicaMetricReport, bytes]
     ):
@@ -402,13 +433,18 @@ def record_autoscaling_metrics_from_replica(
 
         metrics_delay_ms = latency_ms
         if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            metrics_interval_s = self._get_deployment_metrics_interval_s(
+                deployment, application, for_handle=False
+            )
             metrics_delay_ms = _get_aggregated_autoscaling_metrics_delay_ms(
                 self._replica_metrics_delay_ms.setdefault(
                     (deployment, application), {}
                 ),
                 replica_metric_report.replica_id.unique_id,
                 latency_ms,
-                2 * RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
+                _get_autoscaling_metrics_delay_cache_timeout_s(
+                    metrics_interval_s, for_handle=False
+                ),
             )
 
         # Record the metrics delay for observability
@@ -440,13 +476,15 @@ def record_autoscaling_metrics_from_handle(
 
         metrics_delay_ms = latency_ms
         if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            metrics_interval_s = self._get_deployment_metrics_interval_s(
+                deployment, application, for_handle=True
+            )
             metrics_delay_ms = _get_aggregated_autoscaling_metrics_delay_ms(
                 self._handle_metrics_delay_ms.setdefault((deployment, application), {}),
                 handle_metric_report.handle_id,
                 latency_ms,
-                max(
-                    2 * RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
-                    RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S,
+                _get_autoscaling_metrics_delay_cache_timeout_s(
+                    metrics_interval_s, for_handle=True
                 ),
             )
 
diff --git a/python/ray/serve/tests/unit/test_controller.py b/python/ray/serve/tests/unit/test_controller.py
index faf6b020e2c2..9c2cdf2e965d 100644
--- a/python/ray/serve/tests/unit/test_controller.py
+++ b/python/ray/serve/tests/unit/test_controller.py
@@ -1,12 +1,22 @@
 from copy import deepcopy
+from unittest.mock import MagicMock
 
 import pytest
 
 from ray.serve._private.common import TargetCapacityDirection
+from ray.serve._private.constants import (
+    RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
+    RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S,
+    RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
+)
 from ray.serve._private.controller import (
+    ServeController,
+    _get_aggregated_autoscaling_metrics_delay_ms,
+    _get_autoscaling_metrics_delay_cache_timeout_s,
     applications_match,
     calculate_target_capacity_direction,
 )
+from ray.serve.config import AutoscalingConfig
 from ray.serve._private.controller_health_metrics_tracker import (
     _HEALTH_METRICS_HISTORY_SIZE,
     ControllerHealthMetricsTracker,
@@ -714,6 +724,81 @@ def test_component_duration_rolling_window(self):
         assert tracker.dsm_update_durations[0] == 50.0
 
 
+class TestAggregatedAutoscalingMetricsDelay:
+    def test_returns_max_delay_across_sources(self):
+        delay_by_source = {}
+        assert (
+            _get_aggregated_autoscaling_metrics_delay_ms(
+                delay_by_source, "source1", 100.0, timeout_s=10.0
+            )
+            == 100.0
+        )
+        assert (
+            _get_aggregated_autoscaling_metrics_delay_ms(
+                delay_by_source, "source2", 50.0, timeout_s=10.0
+            )
+            == 100.0
+        )
+
+    def test_expires_stale_sources_using_timeout(self):
+        delay_by_source = {}
+        now = 1000.0
+
+        with pytest.MonkeyPatch.context() as monkeypatch:
+            monkeypatch.setattr(
+                "ray.serve._private.controller.time.time", lambda: now
+            )
+            _get_aggregated_autoscaling_metrics_delay_ms(
+                delay_by_source, "slow_source", 200.0, timeout_s=1.0
+            )
+
+            monkeypatch.setattr(
+                "ray.serve._private.controller.time.time", lambda: now + 0.5
+            )
+            assert (
+                _get_aggregated_autoscaling_metrics_delay_ms(
+                    delay_by_source, "fast_source", 50.0, timeout_s=1.0
+                )
+                == 200.0
+            )
+
+            monkeypatch.setattr(
+                "ray.serve._private.controller.time.time", lambda: now + 1.5
+            )
+            assert (
+                _get_aggregated_autoscaling_metrics_delay_ms(
+                    delay_by_source, "fast_source", 50.0, timeout_s=1.0
+                )
+                == 50.0
+            )
+
+    def test_cache_timeout_uses_deployment_metrics_interval(self):
+        assert _get_autoscaling_metrics_delay_cache_timeout_s(30.0) == 60.0
+        assert _get_autoscaling_metrics_delay_cache_timeout_s(
+            30.0, for_handle=True
+        ) == max(60.0, RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S)
+
+    def test_deployment_metrics_interval_lookup(self):
+        controller = ServeController.__new__(ServeController)
+        deployment_config = MagicMock()
+        deployment_config.autoscaling_config = AutoscalingConfig(
+            metrics_interval_s=30.0
+        )
+        controller.get_deployment_config = MagicMock(return_value=deployment_config)
+
+        assert controller._get_deployment_metrics_interval_s(
+            "dep", "app", for_handle=False
+        ) == 30.0
+
+        controller.get_deployment_config = MagicMock(return_value=None)
+        assert controller._get_deployment_metrics_interval_s(
+            "dep", "app", for_handle=False
+        ) == RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S
+        assert controller._get_deployment_metrics_interval_s(
+            "dep", "app", for_handle=True
+        ) == RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S
+
+
 if __name__ == "__main__":
     import sys
 

From 51df080f6d15e63c9fa809659e699d661e21d0ed Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Mon, 15 Jun 2026 16:46:45 +0000
Subject: [PATCH 20/23] Lint/black change

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 .../ray/serve/tests/unit/test_controller.py   | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/python/ray/serve/tests/unit/test_controller.py b/python/ray/serve/tests/unit/test_controller.py
index 9c2cdf2e965d..9f864e91363c 100644
--- a/python/ray/serve/tests/unit/test_controller.py
+++ b/python/ray/serve/tests/unit/test_controller.py
@@ -16,11 +16,11 @@
     applications_match,
     calculate_target_capacity_direction,
 )
-from ray.serve.config import AutoscalingConfig
 from ray.serve._private.controller_health_metrics_tracker import (
     _HEALTH_METRICS_HISTORY_SIZE,
     ControllerHealthMetricsTracker,
 )
+from ray.serve.config import AutoscalingConfig
 from ray.serve.schema import (
     ControllerHealthMetrics,
     DurationStats,
@@ -745,9 +745,7 @@ def test_expires_stale_sources_using_timeout(self):
         now = 1000.0
 
         with pytest.MonkeyPatch.context() as monkeypatch:
-            monkeypatch.setattr(
-                "ray.serve._private.controller.time.time", lambda: now
-            )
+            monkeypatch.setattr("ray.serve._private.controller.time.time", lambda: now)
             _get_aggregated_autoscaling_metrics_delay_ms(
                 delay_by_source, "slow_source", 200.0, timeout_s=1.0
             )
@@ -786,17 +784,24 @@ def test_deployment_metrics_interval_lookup(self):
         )
         controller.get_deployment_config = MagicMock(return_value=deployment_config)
 
-        assert controller._get_deployment_metrics_interval_s(
-            "dep", "app", for_handle=False
-        ) == 30.0
+        assert (
+            controller._get_deployment_metrics_interval_s(
+                "dep", "app", for_handle=False
+            )
+            == 30.0
+        )
 
         controller.get_deployment_config = MagicMock(return_value=None)
-        assert controller._get_deployment_metrics_interval_s(
-            "dep", "app", for_handle=False
-        ) == RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S
-        assert controller._get_deployment_metrics_interval_s(
-            "dep", "app", for_handle=True
-        ) == RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S
+        assert (
+            controller._get_deployment_metrics_interval_s(
+                "dep", "app", for_handle=False
+            )
+            == RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S
+        )
+        assert (
+            controller._get_deployment_metrics_interval_s("dep", "app", for_handle=True)
+            == RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S
+        )
 
 
 if __name__ == "__main__":

From 845adc31fc7af9fc511dc283f36568f325cb730b Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 16 Jun 2026 14:22:59 +0000
Subject: [PATCH 21/23] Added comment to clarify need, cleaned up duplicate
 code

Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/controller.py | 78 ++++++++++++++++---------
 1 file changed, 50 insertions(+), 28 deletions(-)

diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index d8c57bea3ff1..cbd21b25c532 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -415,6 +415,40 @@ def _get_deployment_metrics_interval_s(
             else RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S
         )
 
+    def _autoscaling_metrics_delay_ms(
+        self,
+        delay_cache: Dict[Tuple[str, str], Dict[str, Tuple[float, float]]],
+        deployment: str,
+        application: str,
+        source_id: str,
+        latency_ms: float,
+        *,
+        for_handle: bool,
+    ) -> float:
+        """Delay value to report on the (deployment, application) delay gauge.
+
+        With the high-cardinality replica/handle tag enabled, each source has
+        its own time series, so we report its own latency directly. With the
+        tag disabled, all sources of a deployment share one series; reporting a
+        single source latency would be last-writer-wins and flap, so we
+        aggregate to the max delay across sources that reported within the cache
+        timeout, evicting stale/dead sources so they do not pin the max forever.
+        """
+        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
+            return latency_ms
+
+        metrics_interval_s = self._get_deployment_metrics_interval_s(
+            deployment, application, for_handle=for_handle
+        )
+        return _get_aggregated_autoscaling_metrics_delay_ms(
+            delay_cache.setdefault((deployment, application), {}),
+            source_id,
+            latency_ms,
+            _get_autoscaling_metrics_delay_cache_timeout_s(
+                metrics_interval_s, for_handle=for_handle
+            ),
+        )
+
     def record_autoscaling_metrics_from_replica(
         self, replica_metric_report: Union[ReplicaMetricReport, bytes]
     ):
@@ -431,21 +465,14 @@ def record_autoscaling_metrics_from_replica(
         if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             tags["replica"] = replica_metric_report.replica_id.unique_id
 
-        metrics_delay_ms = latency_ms
-        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-            metrics_interval_s = self._get_deployment_metrics_interval_s(
-                deployment, application, for_handle=False
-            )
-            metrics_delay_ms = _get_aggregated_autoscaling_metrics_delay_ms(
-                self._replica_metrics_delay_ms.setdefault(
-                    (deployment, application), {}
-                ),
-                replica_metric_report.replica_id.unique_id,
-                latency_ms,
-                _get_autoscaling_metrics_delay_cache_timeout_s(
-                    metrics_interval_s, for_handle=False
-                ),
-            )
+        metrics_delay_ms = self._autoscaling_metrics_delay_ms(
+            self._replica_metrics_delay_ms,
+            deployment,
+            application,
+            replica_metric_report.replica_id.unique_id,
+            latency_ms,
+            for_handle=False,
+        )
 
         # Record the metrics delay for observability
         self.replica_metrics_delay_gauge.set(
@@ -474,19 +501,14 @@ def record_autoscaling_metrics_from_handle(
         if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
             tags["handle"] = handle_metric_report.handle_id
 
-        metrics_delay_ms = latency_ms
-        if not RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-            metrics_interval_s = self._get_deployment_metrics_interval_s(
-                deployment, application, for_handle=True
-            )
-            metrics_delay_ms = _get_aggregated_autoscaling_metrics_delay_ms(
-                self._handle_metrics_delay_ms.setdefault((deployment, application), {}),
-                handle_metric_report.handle_id,
-                latency_ms,
-                _get_autoscaling_metrics_delay_cache_timeout_s(
-                    metrics_interval_s, for_handle=True
-                ),
-            )
+        metrics_delay_ms = self._autoscaling_metrics_delay_ms(
+            self._handle_metrics_delay_ms,
+            deployment,
+            application,
+            handle_metric_report.handle_id,
+            latency_ms,
+            for_handle=True,
+        )
 
         # Record the metrics delay for observability
         self.handle_metrics_delay_gauge.set(

From a10d8caeae4cfd28b550441301a6e1199452da00 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 16 Jun 2026 14:41:16 +0000
Subject: [PATCH 22/23] Emit autoscaling metrics delay as a histogram

Replace the per-replica/handle delay gauges with histograms and drop the
high-cardinality replica/handle tag. Prometheus aggregates the per-source
observations into the (deployment, application) series, so the app-side
max-with-eviction aggregation cache is no longer needed and is removed.
This mirrors serve_routing_stats_delay_ms in deployment_state.py.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 python/ray/serve/_private/controller.py       | 167 +++---------------
 python/ray/serve/tests/test_metrics.py        |  21 +--
 python/ray/serve/tests/test_metrics_3.py      |  58 ++----
 .../ray/serve/tests/unit/test_controller.py   |  90 ----------
 4 files changed, 41 insertions(+), 295 deletions(-)

diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py
index cbd21b25c532..910bbf5fede7 100644
--- a/python/ray/serve/_private/controller.py
+++ b/python/ray/serve/_private/controller.py
@@ -35,14 +35,11 @@
 from ray.serve._private.config import DeploymentConfig
 from ray.serve._private.constants import (
     CONTROL_LOOP_INTERVAL_S,
+    DEFAULT_LATENCY_BUCKET_MS,
     RAY_SERVE_CONTROLLER_CALLBACK_IMPORT_PATH,
-    RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS,
     RAY_SERVE_ENABLE_DIRECT_INGRESS,
     RAY_SERVE_ENABLE_HA_PROXY,
-    RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
     RAY_SERVE_LOG_TO_STDERR,
-    RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S,
-    RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
     RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE,
     RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP,
     RAY_SERVE_RUN_USER_CODE_IN_SEPARATE_THREAD,
@@ -128,36 +125,6 @@
 SHUTDOWN_IN_PROGRESS_KEY = "serve-shutdown-in-progress"
 
 
-def _get_aggregated_autoscaling_metrics_delay_ms(
-    delay_by_source: Dict[str, Tuple[float, float]],
-    source_id: str,
-    delay_ms: float,
-    timeout_s: float,
-) -> float:
-    """Return the max latest metrics delay for a deployment/application series."""
-
-    now = time.time()
-    delay_by_source[source_id] = (delay_ms, now)
-    for cached_source_id, (_, updated_at) in list(delay_by_source.items()):
-        if now - updated_at > timeout_s:
-            del delay_by_source[cached_source_id]
-
-    return max(cached_delay_ms for cached_delay_ms, _ in delay_by_source.values())
-
-
-def _get_autoscaling_metrics_delay_cache_timeout_s(
-    metrics_interval_s: float,
-    *,
-    for_handle: bool = False,
-) -> float:
-    """Return cache TTL aligned with per-deployment metrics push interval."""
-
-    timeout_s = 2 * metrics_interval_s
-    if for_handle:
-        timeout_s = max(timeout_s, RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S)
-    return timeout_s
-
-
 class ServeController:
     """Responsible for managing the state of the serving system.
 
@@ -321,12 +288,6 @@ async def __init__(
         self._health_metrics_tracker = ControllerHealthMetricsTracker(
             controller_start_time=time.time()
         )
-        self._replica_metrics_delay_ms: Dict[
-            Tuple[str, str], Dict[str, Tuple[float, float]]
-        ] = {}
-        self._handle_metrics_delay_ms: Dict[
-            Tuple[str, str], Dict[str, Tuple[float, float]]
-        ] = {}
 
         self._create_control_loop_metrics()
         run_background_task(self.run_control_loop())
@@ -397,58 +358,6 @@ def check_alive(self) -> None:
     def get_pid(self) -> int:
         return os.getpid()
 
-    def _get_deployment_metrics_interval_s(
-        self, deployment: str, application: str, *, for_handle: bool
-    ) -> float:
-        deployment_config = self.get_deployment_config(
-            DeploymentID(name=deployment, app_name=application)
-        )
-        if (
-            deployment_config is not None
-            and deployment_config.autoscaling_config is not None
-        ):
-            return deployment_config.autoscaling_config.metrics_interval_s
-
-        return (
-            RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S
-            if for_handle
-            else RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S
-        )
-
-    def _autoscaling_metrics_delay_ms(
-        self,
-        delay_cache: Dict[Tuple[str, str], Dict[str, Tuple[float, float]]],
-        deployment: str,
-        application: str,
-        source_id: str,
-        latency_ms: float,
-        *,
-        for_handle: bool,
-    ) -> float:
-        """Delay value to report on the (deployment, application) delay gauge.
-
-        With the high-cardinality replica/handle tag enabled, each source has
-        its own time series, so we report its own latency directly. With the
-        tag disabled, all sources of a deployment share one series; reporting a
-        single source latency would be last-writer-wins and flap, so we
-        aggregate to the max delay across sources that reported within the cache
-        timeout, evicting stale/dead sources so they do not pin the max forever.
-        """
-        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-            return latency_ms
-
-        metrics_interval_s = self._get_deployment_metrics_interval_s(
-            deployment, application, for_handle=for_handle
-        )
-        return _get_aggregated_autoscaling_metrics_delay_ms(
-            delay_cache.setdefault((deployment, application), {}),
-            source_id,
-            latency_ms,
-            _get_autoscaling_metrics_delay_cache_timeout_s(
-                metrics_interval_s, for_handle=for_handle
-            ),
-        )
-
     def record_autoscaling_metrics_from_replica(
         self, replica_metric_report: Union[ReplicaMetricReport, bytes]
     ):
@@ -458,26 +367,16 @@ def record_autoscaling_metrics_from_replica(
         latency_ms = latency * 1000
         deployment = replica_metric_report.replica_id.deployment_id.name
         application = replica_metric_report.replica_id.deployment_id.app_name
-        tags = {
-            "deployment": deployment,
-            "application": application,
-        }
-        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-            tags["replica"] = replica_metric_report.replica_id.unique_id
-
-        metrics_delay_ms = self._autoscaling_metrics_delay_ms(
-            self._replica_metrics_delay_ms,
-            deployment,
-            application,
-            replica_metric_report.replica_id.unique_id,
-            latency_ms,
-            for_handle=False,
-        )
 
-        # Record the metrics delay for observability
-        self.replica_metrics_delay_gauge.set(
-            metrics_delay_ms,
-            tags=tags,
+        # Record the metrics delay for observability. A histogram lets Prometheus
+        # aggregate reports from all replicas of a deployment, so we omit the
+        # per-replica tag to keep cardinality bounded.
+        self.replica_metrics_delay_histogram.observe(
+            latency_ms,
+            tags={
+                "deployment": deployment,
+                "application": application,
+            },
         )
         # Track in health metrics
         self._health_metrics_tracker.record_replica_metrics_delay(latency_ms)
@@ -494,26 +393,16 @@ def record_autoscaling_metrics_from_handle(
         latency_ms = latency * 1000
         deployment = handle_metric_report.deployment_id.name
         application = handle_metric_report.deployment_id.app_name
-        tags = {
-            "deployment": deployment,
-            "application": application,
-        }
-        if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS:
-            tags["handle"] = handle_metric_report.handle_id
-
-        metrics_delay_ms = self._autoscaling_metrics_delay_ms(
-            self._handle_metrics_delay_ms,
-            deployment,
-            application,
-            handle_metric_report.handle_id,
-            latency_ms,
-            for_handle=True,
-        )
 
-        # Record the metrics delay for observability
-        self.handle_metrics_delay_gauge.set(
-            metrics_delay_ms,
-            tags=tags,
+        # Record the metrics delay for observability. A histogram lets Prometheus
+        # aggregate reports from all handles of a deployment, so we omit the
+        # per-handle tag to keep cardinality bounded.
+        self.handle_metrics_delay_histogram.observe(
+            latency_ms,
+            tags={
+                "deployment": deployment,
+                "application": application,
+            },
         )
         # Track in health metrics
         self._health_metrics_tracker.record_handle_metrics_delay(latency_ms)
@@ -874,29 +763,23 @@ def _create_control_loop_metrics(self):
         )
 
         # Autoscaling metrics delay gauges
-        self.replica_metrics_delay_gauge = metrics.Gauge(
+        self.replica_metrics_delay_histogram = metrics.Histogram(
             "serve_autoscaling_replica_metrics_delay_ms",
             description=(
                 "Time taken for the replica metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=(
-                ("deployment", "application", "replica")
-                if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS
-                else ("deployment", "application")
-            ),
+            boundaries=DEFAULT_LATENCY_BUCKET_MS,
+            tag_keys=("deployment", "application"),
         )
-        self.handle_metrics_delay_gauge = metrics.Gauge(
+        self.handle_metrics_delay_histogram = metrics.Histogram(
             "serve_autoscaling_handle_metrics_delay_ms",
             description=(
                 "Time taken for the handle metrics to be reported to the controller. "
                 "High values may indicate a busy controller."
             ),
-            tag_keys=(
-                ("deployment", "application", "handle")
-                if RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS
-                else ("deployment", "application")
-            ),
+            boundaries=DEFAULT_LATENCY_BUCKET_MS,
+            tag_keys=("deployment", "application"),
         )
         self.async_inference_task_queue_metrics_delay_gauge = metrics.Gauge(
             "serve_autoscaling_async_inference_task_queue_metrics_delay_ms",
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index 6f96330521f9..bc833587eb4e 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -1410,32 +1410,13 @@ def check_controller_metric_tags():
             lifecycle_deployment_name,
             lifecycle_app_name,
         )
-        handle_metrics = get_matching_metrics(
-            "ray_serve_autoscaling_handle_metrics_delay_ms",
-            autoscaling_deployment_name,
-            autoscaling_app_name,
-        )
-        replica_metrics = get_matching_metrics(
-            "ray_serve_autoscaling_replica_metrics_delay_ms",
-            autoscaling_deployment_name,
-            autoscaling_app_name,
-        )
-        if (
-            not health_failure_metrics
-            or not health_status_metrics
-            or not handle_metrics
-            or not replica_metrics
-        ):
+        if not health_failure_metrics or not health_status_metrics:
             return False
 
         for metric in health_failure_metrics:
             assert_high_cardinality_tag(metric, "replica")
         for metric in health_status_metrics:
             assert_high_cardinality_tag(metric, "replica")
-        for metric in handle_metrics:
-            assert_high_cardinality_tag(metric, "handle")
-        for metric in replica_metrics:
-            assert_high_cardinality_tag(metric, "replica")
 
         return True
 
diff --git a/python/ray/serve/tests/test_metrics_3.py b/python/ray/serve/tests/test_metrics_3.py
index f607e9951c4b..61b036c597d1 100644
--- a/python/ray/serve/tests/test_metrics_3.py
+++ b/python/ray/serve/tests/test_metrics_3.py
@@ -497,9 +497,9 @@ def test_autoscaling_metrics(metrics_start_shutdown):
         requests per replica
         Tags: deployment, application
     - ray_serve_autoscaling_replica_metrics_delay_ms: Replica metrics delay
-        Tags: deployment, application, replica
+        Tags: deployment, application
     - ray_serve_autoscaling_handle_metrics_delay_ms: Handle metrics delay
-        Tags: deployment, application, handle
+        Tags: deployment, application
     """
     signal = SignalActor.remote()
 
@@ -589,42 +589,17 @@ def check_policy_execution_time_metric():
     )
     print("Target ongoing requests metric verified.")
 
-    # Test 6: Check that metrics delay gauges are emitted with proper tags
+    # Test 6: Check that the metrics delay histograms are emitted. These are
+    # aggregated by Prometheus across all sources, so they carry no
+    # per-replica/handle tag.
     def check_metrics_delay_metrics():
-        # Check for handle metrics delay (depends on where metrics are collected)
-        value = get_metric_float(
-            "ray_serve_autoscaling_handle_metrics_delay_ms",
-            expected_tags=base_tags,
-            timeseries=timeseries,
-        )
-        if value >= 0:
-            # Verify handle tag exists by checking metric dictionaries
-            metrics_dicts = get_metric_dictionaries(
-                "ray_serve_autoscaling_handle_metrics_delay_ms",
-                timeout=5,
-                timeseries=timeseries,
-                wait=False,
-            )
-            for m in metrics_dicts:
-                if (
-                    m.get("deployment") == "AutoscalingDeployment"
-                    and m.get("application") == "autoscaling_app"
-                ):
-                    assert m.get("handle") is not None
-                    print(
-                        f"Handle delay metric verified with handle tag: {m.get('handle')}"
-                    )
-                    return True
-
-        # Fallback: Check for replica metrics delay
-        value = get_metric_float(
-            "ray_serve_autoscaling_replica_metrics_delay_ms",
-            expected_tags=base_tags,
-            timeseries=timeseries,
-        )
-        if value >= 0:
+        found = False
+        for metric_name in (
+            "ray_serve_autoscaling_handle_metrics_delay_ms_count",
+            "ray_serve_autoscaling_replica_metrics_delay_ms_count",
+        ):
             metrics_dicts = get_metric_dictionaries(
-                "ray_serve_autoscaling_replica_metrics_delay_ms",
+                metric_name,
                 timeout=5,
                 timeseries=timeseries,
                 wait=False,
@@ -634,13 +609,10 @@ def check_metrics_delay_metrics():
                     m.get("deployment") == "AutoscalingDeployment"
                     and m.get("application") == "autoscaling_app"
                 ):
-                    assert m.get("replica") is not None
-                    print(
-                        f"Replica delay metric verified with replica tag: {m.get('replica')}"
-                    )
-                    return True
-
-        return False
+                    assert "replica" not in m
+                    assert "handle" not in m
+                    found = True
+        return found
 
     wait_for_condition(check_metrics_delay_metrics, timeout=15)
     print("Metrics delay metrics verified.")
diff --git a/python/ray/serve/tests/unit/test_controller.py b/python/ray/serve/tests/unit/test_controller.py
index 9f864e91363c..faf6b020e2c2 100644
--- a/python/ray/serve/tests/unit/test_controller.py
+++ b/python/ray/serve/tests/unit/test_controller.py
@@ -1,18 +1,9 @@
 from copy import deepcopy
-from unittest.mock import MagicMock
 
 import pytest
 
 from ray.serve._private.common import TargetCapacityDirection
-from ray.serve._private.constants import (
-    RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
-    RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S,
-    RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S,
-)
 from ray.serve._private.controller import (
-    ServeController,
-    _get_aggregated_autoscaling_metrics_delay_ms,
-    _get_autoscaling_metrics_delay_cache_timeout_s,
     applications_match,
     calculate_target_capacity_direction,
 )
@@ -20,7 +11,6 @@
     _HEALTH_METRICS_HISTORY_SIZE,
     ControllerHealthMetricsTracker,
 )
-from ray.serve.config import AutoscalingConfig
 from ray.serve.schema import (
     ControllerHealthMetrics,
     DurationStats,
@@ -724,86 +714,6 @@ def test_component_duration_rolling_window(self):
         assert tracker.dsm_update_durations[0] == 50.0
 
 
-class TestAggregatedAutoscalingMetricsDelay:
-    def test_returns_max_delay_across_sources(self):
-        delay_by_source = {}
-        assert (
-            _get_aggregated_autoscaling_metrics_delay_ms(
-                delay_by_source, "source1", 100.0, timeout_s=10.0
-            )
-            == 100.0
-        )
-        assert (
-            _get_aggregated_autoscaling_metrics_delay_ms(
-                delay_by_source, "source2", 50.0, timeout_s=10.0
-            )
-            == 100.0
-        )
-
-    def test_expires_stale_sources_using_timeout(self):
-        delay_by_source = {}
-        now = 1000.0
-
-        with pytest.MonkeyPatch.context() as monkeypatch:
-            monkeypatch.setattr("ray.serve._private.controller.time.time", lambda: now)
-            _get_aggregated_autoscaling_metrics_delay_ms(
-                delay_by_source, "slow_source", 200.0, timeout_s=1.0
-            )
-
-            monkeypatch.setattr(
-                "ray.serve._private.controller.time.time", lambda: now + 0.5
-            )
-            assert (
-                _get_aggregated_autoscaling_metrics_delay_ms(
-                    delay_by_source, "fast_source", 50.0, timeout_s=1.0
-                )
-                == 200.0
-            )
-
-            monkeypatch.setattr(
-                "ray.serve._private.controller.time.time", lambda: now + 1.5
-            )
-            assert (
-                _get_aggregated_autoscaling_metrics_delay_ms(
-                    delay_by_source, "fast_source", 50.0, timeout_s=1.0
-                )
-                == 50.0
-            )
-
-    def test_cache_timeout_uses_deployment_metrics_interval(self):
-        assert _get_autoscaling_metrics_delay_cache_timeout_s(30.0) == 60.0
-        assert _get_autoscaling_metrics_delay_cache_timeout_s(
-            30.0, for_handle=True
-        ) == max(60.0, RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S)
-
-    def test_deployment_metrics_interval_lookup(self):
-        controller = ServeController.__new__(ServeController)
-        deployment_config = MagicMock()
-        deployment_config.autoscaling_config = AutoscalingConfig(
-            metrics_interval_s=30.0
-        )
-        controller.get_deployment_config = MagicMock(return_value=deployment_config)
-
-        assert (
-            controller._get_deployment_metrics_interval_s(
-                "dep", "app", for_handle=False
-            )
-            == 30.0
-        )
-
-        controller.get_deployment_config = MagicMock(return_value=None)
-        assert (
-            controller._get_deployment_metrics_interval_s(
-                "dep", "app", for_handle=False
-            )
-            == RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S
-        )
-        assert (
-            controller._get_deployment_metrics_interval_s("dep", "app", for_handle=True)
-            == RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S
-        )
-
-
 if __name__ == "__main__":
     import sys
 

From cb6edafb3373999bc81c497b5510edde8a842ba0 Mon Sep 17 00:00:00 2001
From: "john.taylor" <john.taylor@anyscale.com>
Date: Tue, 16 Jun 2026 19:46:34 +0000
Subject: [PATCH 23/23] Update docs for histogram autoscaling delay metrics

The autoscaling delay metrics are now histograms aggregated server-side by
Prometheus and no longer carry the per-replica/handle tag, so they are not
controlled by RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS.
Update monitoring.md (type Gauge -> Histogram, drop the source tag, fix the
section preamble) and drop the stale "handle IDs" mention from the flag
docstring (no controller metric carries a handle tag anymore). The flag still
governs the replica tag on lifecycle/health metrics.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: john.taylor <john.taylor@anyscale.com>
---
 doc/source/serve/monitoring.md         | 16 +++++++---------
 python/ray/serve/_private/constants.py |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/doc/source/serve/monitoring.md b/doc/source/serve/monitoring.md
index 63996e56e430..18ce7e57f13e 100644
--- a/doc/source/serve/monitoring.md
+++ b/doc/source/serve/monitoring.md
@@ -762,13 +762,11 @@ This setting doesn't affect replica-emitted metrics such as
 
 These metrics provide visibility into autoscaling behavior and help debug scaling issues.
 
-By default, controller-emitted autoscaling delay metrics include source
-identifiers such as `handle` and `replica` where applicable. For large deployments, set
-`RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS=0` to drop those
-source-level high-cardinality tags while retaining `deployment` and `application`.
-This setting doesn't affect replica-emitted metrics such as
-`ray_serve_record_autoscaling_stats_failed_total` or
-`ray_serve_user_autoscaling_stats_latency_ms`.
+The autoscaling delay metrics `ray_serve_autoscaling_replica_metrics_delay_ms`
+and `ray_serve_autoscaling_handle_metrics_delay_ms` are **histograms** labeled
+only by `deployment` and `application`. Prometheus aggregates the per-replica
+and per-handle observations server-side, so no source-level tag is emitted and
+cardinality stays bounded at scale.
 
 | Metric | Type | Tags | Description |
 |--------|------|------|-------------|
@@ -777,8 +775,8 @@ This setting doesn't affect replica-emitted metrics such as
 | `ray_serve_autoscaling_total_requests` | Gauge | `deployment`, `application` | Total number of requests (queued + in-flight) as seen by the autoscaler. This is the input to the scaling decision. |
 | `ray_serve_autoscaling_target_ongoing_requests` | Gauge | `deployment`, `application` | Configured `target_ongoing_requests` per replica. Divide `ray_serve_autoscaling_total_requests` by this value to compute the raw desired replica count for the default policy and detect autoscaling regressions. |
 | `ray_serve_autoscaling_policy_execution_time_ms` | Gauge | `deployment`, `application`, `policy_scope` | Time taken to execute the autoscaling policy in milliseconds. `policy_scope` is `deployment` or `application`. |
-| `ray_serve_autoscaling_replica_metrics_delay_ms` | Gauge | `deployment`, `application`, `replica` | Time taken for replica metrics to reach the controller in milliseconds. High values may indicate controller overload. |
-| `ray_serve_autoscaling_handle_metrics_delay_ms` | Gauge | `deployment`, `application`, `handle` | Time taken for handle metrics to reach the controller in milliseconds. High values may indicate controller overload. |
+| `ray_serve_autoscaling_replica_metrics_delay_ms` | Histogram | `deployment`, `application` | Time taken for replica metrics to reach the controller in milliseconds. High values may indicate controller overload. |
+| `ray_serve_autoscaling_handle_metrics_delay_ms` | Histogram | `deployment`, `application` | Time taken for handle metrics to reach the controller in milliseconds. High values may indicate controller overload. |
 | `ray_serve_autoscaling_async_inference_task_queue_metrics_delay_ms` | Gauge | `deployment`, `application` | Time taken for async inference task queue metrics (from QueueMonitor) to reach the controller in milliseconds. |
 | `ray_serve_record_autoscaling_stats_failed_total` | Counter | `application`, `deployment`, `replica`, `exception_name` | Total number of failed attempts to collect autoscaling metrics on replica from user defined function. Non-zero values indicate error in user code. |
 | `ray_serve_user_autoscaling_stats_latency_ms` | Histogram | `application`, `deployment`, `replica` | Histogram of time taken to execute the user-defined autoscaling stats function in milliseconds. |
diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 168b2289741c..44b52fd7ea68 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -1012,7 +1012,7 @@
 
 # Feature flag to include high-cardinality source tags on Serve controller metrics.
 # Disable this to keep deployment/application tags while dropping source identifiers
-# like replica IDs and handle IDs from controller-emitted metrics.
+# like replica IDs from controller-emitted metrics.
 RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS = get_env_bool(
     "RAY_SERVE_CONTROLLER_METRICS_INCLUDE_HIGH_CARDINALITY_TAGS", "1"
 )